/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 *This implementation is based on the previous SSSE3 release:
 *Visit http://software.intel.com/en-us/articles/
 *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 *Updates 20-byte SHA-1 record in 'hash' for even number of
 *'num_blocks' consecutive 64-byte blocks
 *
 *extern "C" void sha1_transform_avx2(
 *	int *hash, const char* input, size_t num_blocks );
 */

#include <linux/linkage.h>

#define	CTX	%rdi	/* arg1 */
#define BUF	%rsi	/* arg2 */
#define CNT	%rdx	/* arg3 */

#define	REG_A	%ecx
#define	REG_B	%esi
#define	REG_C	%edi
#define	REG_D	%eax
#define	REG_E	%edx
#define	REG_TB	%ebx
#define	REG_TA	%r12d
#define	REG_RA	%rcx
#define	REG_RB	%rsi
#define	REG_RC	%rdi
#define	REG_RD	%rax
#define	REG_RE	%rdx
#define	REG_RTA	%r12
#define	REG_RTB	%rbx
#define	REG_T1	%ebp
#define	xmm_mov	vmovups
#define	avx2_zeroupper	vzeroupper
#define	RND_F1	1
#define	RND_F2	2
#define	RND_F3	3

.macro REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set TB, REG_TB
	.set TA, REG_TA

	.set RA, REG_RA
	.set RB, REG_RB
	.set RC, REG_RC
	.set RD, REG_RD
	.set RE, REG_RE

	.set RTA, REG_RTA
	.set RTB, REG_RTB

	.set T1, REG_T1
.endm

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_PTR2	%r13
#define BUFFER_END	%r11

#define PRECALC_BUF	%r14
#define WK_BUF		%r15

#define W_TMP		%xmm0
#define WY_TMP		%ymm0
#define WY_TMP2		%ymm9

# AVX2 variables
#define WY0		%ymm3
#define WY4		%ymm5
#define WY08		%ymm7
#define WY12		%ymm8
#define WY16		%ymm12
#define WY20		%ymm13
#define WY24		%ymm14
#define WY28		%ymm15

#define YMM_SHUFB_BSWAP	%ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE		(80*2*2 +16)

#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)


.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

.macro PRECALC_RESET_WY
	.set WY_00, WY0
	.set WY_04, WY4
	.set WY_08, WY08
	.set WY_12, WY12
	.set WY_16, WY16
	.set WY_20, WY20
	.set WY_24, WY24
	.set WY_28, WY28
	.set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
	/* Rotate macros */
	.set WY_32, WY_28
	.set WY_28, WY_24
	.set WY_24, WY_20
	.set WY_20, WY_16
	.set WY_16, WY_12
	.set WY_12, WY_08
	.set WY_08, WY_04
	.set WY_04, WY_00
	.set WY_00, WY_32

	/* Define register aliases */
	.set WY, WY_00
	.set WY_minus_04, WY_04
	.set WY_minus_08, WY_08
	.set WY_minus_12, WY_12
	.set WY_minus_16, WY_16
	.set WY_minus_20, WY_20
	.set WY_minus_24, WY_24
	.set WY_minus_28, WY_28
	.set WY_minus_32, WY
.endm

.macro PRECALC_00_15
	.if (i == 0) # Initialize and rotate registers
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	/* message scheduling pre-compute for rounds 0-15 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
			 WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		vpaddd	K_XMM(K_BASE), WY, WY_TMP
	.elseif ((i & 7) == 7)
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
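
/*
 * For reference only (not assembled): a C sketch of what the rounds 0-15
 * pre-compute above produces.  Each stored WK value is the byte-swapped
 * message word plus the round constant, so the scalar rounds need only a
 * single "add WK(t), E".  The vector code does this for two blocks at once
 * (BUFFER_PTR in the low 128-bit lane, BUFFER_PTR2 in the high lane).  The
 * helper names below (load_be32, sha1_k, precalc_00_15) are illustrative,
 * not kernel APIs; uint32_t is assumed from <stdint.h>.
 *
 *	static uint32_t load_be32(const unsigned char *p)
 *	{
 *		return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
 *		       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
 *	}
 *
 *	static uint32_t sha1_k(int t)
 *	{
 *		return t < 20 ? 0x5a827999 : t < 40 ? 0x6ed9eba1 :
 *		       t < 60 ? 0x8f1bbcdc : 0xca62c1d6;
 *	}
 *
 *	// w[t] for t = 0..15 is the big-endian message word; store w[t]+K.
 *	static void precalc_00_15(const unsigned char *block, uint32_t wk[16])
 *	{
 *		int t;
 *
 *		for (t = 0; t < 16; t++)
 *			wk[t] = load_be32(block + 4 * t) + sha1_k(t);
 *	}
 */
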
.macro PRECALC_16_31
	/*
	 * message scheduling pre-compute for rounds 16-31
	 * calculating last 32 w[i] values in 8 XMM registers
	 * pre-calculate K+w[i] values and store to mem
	 * for later load by ALU add instruction
	 *
	 * "brute force" vectorization for rounds 16-31 only
	 * due to w[i]->w[i-3] dependency
	 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		/* w[i-14] */
		vpalignr	$8, WY_minus_16, WY_minus_12, WY
		vpsrldq	$4, WY_minus_04, WY_TMP		/* w[i-3] */
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY
		vpslldq	$12, WY, WY_TMP2
	.elseif ((i & 7) == 3)
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP
		vpslld	$2, WY_TMP2, WY
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		vpxor	WY_TMP2, WY_TMP, WY
		vpaddd	K_XMM(K_BASE), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

.macro PRECALC_32_79
	/*
	 * in SHA-1 specification:
	 * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
	 * instead we do equal:
	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	 * allows more efficient vectorization
	 * since w[i]=>w[i-3] dependency is broken
	 */

	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* W is W_minus_32 before xor */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY
	.elseif ((i & 7) == 7)
		vpaddd	K_XMM(K_BASE), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

.macro PRECALC r, s
	.set i, \r

	.if (i < 40)
		.set K_XMM, 32*0
	.elseif (i < 80)
		.set K_XMM, 32*1
	.elseif (i < 120)
		.set K_XMM, 32*2
	.else
		.set K_XMM, 32*3
	.endif

	.if (i<32)
		PRECALC_00_15 \s
	.elseif (i<64)
		PRECALC_16_31 \s
	.elseif (i < 160)
		PRECALC_32_79 \s
	.endif
.endm

.macro ROTATE_STATE
	.set T_REG, E
	.set E, D
	.set D, C
	.set C, B
	.set B, TB
	.set TB, A
	.set A, T_REG

	.set T_REG, RE
	.set RE, RD
	.set RD, RC
	.set RC, RB
	.set RB, RTB
	.set RTB, RA
	.set RA, T_REG
.endm

/* Macro relies on saved ROUND_Fx */

.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm

.macro RR r
	.set round_id, (\r % 80)

	.if (round_id == 0)	/* Precalculate F for first round */
		.set ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B	/* b>>>2 */
		andn	D, TB, T1
		and	C, TB
		xor	T1, TB
	.endif

	RND_FUN ROUND_FUNC, \r
	ROTATE_STATE

	.if   (round_id == 18)
		.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set ROUND_FUNC, RND_F2
	.endif

	.set round_id, ( (\r+1) % 80)

	RND_FUN ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm
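
/*
 * For reference only (not assembled): a scalar C sketch of the schedule
 * trick used in PRECALC_32_79 above.  Expanding each term of the standard
 * recurrence w[t] = rol(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16], 1) once more
 * and cancelling the duplicated terms under xor gives, for t >= 32,
 * w[t] = rol(w[t-6] ^ w[t-16] ^ w[t-28] ^ w[t-32], 2), which removes the
 * w[t-3] dependency and allows eight values per vector iteration.  The
 * helper names below are illustrative only.
 *
 *	static uint32_t rol32_c(uint32_t x, int n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 *	static void expand_32_79(uint32_t w[80])
 *	{
 *		int t;
 *
 *		for (t = 32; t < 80; t++)
 *			w[t] = rol32_c(w[t - 6] ^ w[t - 16] ^
 *				       w[t - 28] ^ w[t - 32], 2);
 *	}
 */
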
.macro ROUND_F1 r
	add	WK(\r), E

	andn	C, A, T1		/* ~b&d */
	lea	(RE,RTB), E		/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30),A, TB		/* b>>>2 for next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	/*
	 * Calculate F for the next round
	 * (b & c) ^ andn[b, d]
	 */
	and	B, A			/* b&c */
	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

	lea	(RE,RTA), E		/* E += A >>> 5 */
.endm

.macro ROUND_F2 r
	add	WK(\r), E
	lea	(RE,RTB), E		/* Add F from the previous round */

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	/* b>>>2 for next round */
	.endif
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	.if ((round_id) < 79)
		xor	B, A
	.endif

	add	TA, E			/* E += A >>> 5 */

	.if ((round_id) < 79)
		xor	C, A
	.endif
.endm

.macro ROUND_F3 r
	add	WK(\r), E
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	lea	(RE,RTB), E		/* Add F from the previous round */

	mov	B, T1
	or	A, T1

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	/* Calculate F for the next round
	 * (b and c) or (d and (b or c))
	 */
	and	C, T1
	and	B, A
	or	T1, A

	add	TA, E			/* E += A >>> 5 */

.endm
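
/*
 * For reference only (not assembled): the textbook SHA-1 round that
 * ROUND_F1/ROUND_F2/ROUND_F3 above implement.  The assembly computes F for
 * round t+1 one round early (kept in TB) so it can be folded in with
 * "lea (RE,RTB), E"; this scalar sketch keeps the straightforward form.
 * wk is assumed to already hold w[t] + K(t), as stored by the pre-compute
 * macros.  The names below are illustrative, not kernel APIs.
 *
 *	static uint32_t sha1_f(int t, uint32_t b, uint32_t c, uint32_t d)
 *	{
 *		if (t < 20)
 *			return (b & c) | (~b & d);		// F1
 *		if (t < 40 || t >= 60)
 *			return b ^ c ^ d;			// F2
 *		return (b & c) | (b & d) | (c & d);		// F3
 *	}
 *
 *	static void sha1_round(int t, uint32_t s[5], uint32_t wk)
 *	{
 *		uint32_t tmp = ((s[0] << 5) | (s[0] >> 27)) +
 *			       sha1_f(t, s[1], s[2], s[3]) + s[4] + wk;
 *
 *		s[4] = s[3];
 *		s[3] = s[2];
 *		s[2] = (s[1] << 30) | (s[1] >> 2);	// b >>> 2
 *		s[1] = s[0];
 *		s[0] = tmp;
 *	}
 */
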
/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	# Precalc WK for first 2 blocks
	PRECALC_OFFSET = 0
	.set i, 0
	.rept    160
		PRECALC i
		.set i, i + 1
	.endr
	PRECALC_OFFSET = 128
	xchg	WK_BUF, PRECALC_BUF

	.align 32
_loop:
	/*
	 * code loops through more than one block
	 * we use K_BASE value as a signal of a last block,
	 * it is set below by: cmovae BUFFER_PTR, K_BASE
	 */
	cmp	K_BASE, BUFFER_PTR
	jne	_begin
	.align 32
	jmp	_end
	.align 32
_begin:

	/*
	 * Do first block
	 * rounds: 0,2,4,6,8
	 */
	.set j, 0
	.rept 5
		RR	j
		.set j, j+2
	.endr

	jmp _loop0
_loop0:

	/*
	 * rounds:
	 * 10,12,14,16,18
	 * 20,22,24,26,28
	 * 30,32,34,36,38
	 * 40,42,44,46,48
	 * 50,52,54,56,58
	 */
	.rept 25
		RR	j
		.set j, j+2
	.endr

	add	$(2*64), BUFFER_PTR	/* move to next odd-64-byte block */
	cmp	BUFFER_END, BUFFER_PTR	/* is current block the last one? */
	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration smartly */

	/*
	 * rounds
	 * 60,62,64,66,68
	 * 70,72,74,76,78
	 */
	.rept 10
		RR	j
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	cmp	K_BASE, BUFFER_PTR	/* is current block the last one? */
	je	_loop

	mov	TB, B

	/* Process second block */
	/*
	 * rounds
	 * 0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */

	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	_loop1
_loop1:
	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	_loop2
_loop2:

	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	add	$(2*64), BUFFER_PTR2	/* move to next even-64-byte block */

	cmp	BUFFER_END, BUFFER_PTR2	/* is current block the last one */
	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration smartly */

	jmp	_loop3
_loop3:

	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Reset state for AVX2 reg permutation */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	xchg	WK_BUF, PRECALC_BUF

	jmp	_loop

	.align 32
	_end:

.endm
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
	ENTRY(\name)

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	RESERVE_STACK  = (W_SIZE*4 + 8+24)

	/* Align stack */
	mov	%rsp, %rbx
	and	$~(0x20-1), %rsp
	push	%rbx
	sub	$RESERVE_STACK, %rsp

	avx2_zeroupper

	lea	K_XMM_AR(%rip), K_BASE

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR
	lea	64(BUF), BUFFER_PTR2

	shl	$6, CNT			/* mul by 64 */
	add	BUF, CNT
	add	$64, CNT
	mov	CNT, BUFFER_END

	cmp	BUFFER_END, BUFFER_PTR2
	cmovae	K_BASE, BUFFER_PTR2

	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	add	$RESERVE_STACK, %rsp
	pop	%rsp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx

	ret

	ENDPROC(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
.text

SHA1_VECTOR_ASM     sha1_transform_avx2
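
/*
 * For reference only (not assembled): a hedged sketch of a C caller using
 * the prototype from the header comment at the top of this file (which
 * expects an even number of complete 64-byte blocks).  The real glue code
 * additionally handles CPU feature checks and message padding; the names
 * below are illustrative, not kernel APIs.
 *
 *	void sha1_transform_avx2(int *hash, const char *input,
 *				 size_t num_blocks);
 *
 *	// Digest nblocks complete blocks starting from the SHA-1 initial
 *	// value (FIPS 180-4); no padding is applied here.
 *	static void sha1_avx2_oneshot_blocks(unsigned int state[5],
 *					     const char *data, size_t nblocks)
 *	{
 *		state[0] = 0x67452301;
 *		state[1] = 0xefcdab89;
 *		state[2] = 0x98badcfe;
 *		state[3] = 0x10325476;
 *		state[4] = 0xc3d2e1f0;
 *
 *		if (nblocks)
 *			sha1_transform_avx2((int *)state, data, nblocks);
 *	}
 */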