/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates 20-byte SHA-1 record at start of 'state', from 'input', for
 * even number of 'blocks' consecutive 64-byte blocks.
677c1da8d0Schandramouli narayanan * 687c1da8d0Schandramouli narayanan *extern "C" void sha1_transform_avx2( 6941419a28SKees Cook * struct sha1_state *state, const u8* input, int blocks ); 707c1da8d0Schandramouli narayanan */ 717c1da8d0Schandramouli narayanan 727c1da8d0Schandramouli narayanan#include <linux/linkage.h> 737c1da8d0Schandramouli narayanan 747c1da8d0Schandramouli narayanan#define CTX %rdi /* arg1 */ 757c1da8d0Schandramouli narayanan#define BUF %rsi /* arg2 */ 767c1da8d0Schandramouli narayanan#define CNT %rdx /* arg3 */ 777c1da8d0Schandramouli narayanan 787c1da8d0Schandramouli narayanan#define REG_A %ecx 797c1da8d0Schandramouli narayanan#define REG_B %esi 807c1da8d0Schandramouli narayanan#define REG_C %edi 817c1da8d0Schandramouli narayanan#define REG_D %eax 827c1da8d0Schandramouli narayanan#define REG_E %edx 837c1da8d0Schandramouli narayanan#define REG_TB %ebx 847c1da8d0Schandramouli narayanan#define REG_TA %r12d 857c1da8d0Schandramouli narayanan#define REG_RA %rcx 867c1da8d0Schandramouli narayanan#define REG_RB %rsi 877c1da8d0Schandramouli narayanan#define REG_RC %rdi 887c1da8d0Schandramouli narayanan#define REG_RD %rax 897c1da8d0Schandramouli narayanan#define REG_RE %rdx 907c1da8d0Schandramouli narayanan#define REG_RTA %r12 917c1da8d0Schandramouli narayanan#define REG_RTB %rbx 92d7b1722cSJosh Poimboeuf#define REG_T1 %r11d 937c1da8d0Schandramouli narayanan#define xmm_mov vmovups 947c1da8d0Schandramouli narayanan#define avx2_zeroupper vzeroupper 957c1da8d0Schandramouli narayanan#define RND_F1 1 967c1da8d0Schandramouli narayanan#define RND_F2 2 977c1da8d0Schandramouli narayanan#define RND_F3 3 987c1da8d0Schandramouli narayanan 997c1da8d0Schandramouli narayanan.macro REGALLOC 1007c1da8d0Schandramouli narayanan .set A, REG_A 1017c1da8d0Schandramouli narayanan .set B, REG_B 1027c1da8d0Schandramouli narayanan .set C, REG_C 1037c1da8d0Schandramouli narayanan .set D, REG_D 1047c1da8d0Schandramouli narayanan .set E, REG_E 1057c1da8d0Schandramouli narayanan .set 
TB, REG_TB 1067c1da8d0Schandramouli narayanan .set TA, REG_TA 1077c1da8d0Schandramouli narayanan 1087c1da8d0Schandramouli narayanan .set RA, REG_RA 1097c1da8d0Schandramouli narayanan .set RB, REG_RB 1107c1da8d0Schandramouli narayanan .set RC, REG_RC 1117c1da8d0Schandramouli narayanan .set RD, REG_RD 1127c1da8d0Schandramouli narayanan .set RE, REG_RE 1137c1da8d0Schandramouli narayanan 1147c1da8d0Schandramouli narayanan .set RTA, REG_RTA 1157c1da8d0Schandramouli narayanan .set RTB, REG_RTB 1167c1da8d0Schandramouli narayanan 1177c1da8d0Schandramouli narayanan .set T1, REG_T1 1187c1da8d0Schandramouli narayanan.endm 1197c1da8d0Schandramouli narayanan 1207c1da8d0Schandramouli narayanan#define HASH_PTR %r9 1218861249cSmegha.dey@linux.intel.com#define BLOCKS_CTR %r8 1227c1da8d0Schandramouli narayanan#define BUFFER_PTR %r10 1237c1da8d0Schandramouli narayanan#define BUFFER_PTR2 %r13 1247c1da8d0Schandramouli narayanan 1257c1da8d0Schandramouli narayanan#define PRECALC_BUF %r14 1267c1da8d0Schandramouli narayanan#define WK_BUF %r15 1277c1da8d0Schandramouli narayanan 1287c1da8d0Schandramouli narayanan#define W_TMP %xmm0 1297c1da8d0Schandramouli narayanan#define WY_TMP %ymm0 1307c1da8d0Schandramouli narayanan#define WY_TMP2 %ymm9 1317c1da8d0Schandramouli narayanan 1327c1da8d0Schandramouli narayanan# AVX2 variables 1337c1da8d0Schandramouli narayanan#define WY0 %ymm3 1347c1da8d0Schandramouli narayanan#define WY4 %ymm5 1357c1da8d0Schandramouli narayanan#define WY08 %ymm7 1367c1da8d0Schandramouli narayanan#define WY12 %ymm8 1377c1da8d0Schandramouli narayanan#define WY16 %ymm12 1387c1da8d0Schandramouli narayanan#define WY20 %ymm13 1397c1da8d0Schandramouli narayanan#define WY24 %ymm14 1407c1da8d0Schandramouli narayanan#define WY28 %ymm15 1417c1da8d0Schandramouli narayanan 1427c1da8d0Schandramouli narayanan#define YMM_SHUFB_BSWAP %ymm10 1437c1da8d0Schandramouli narayanan 1447c1da8d0Schandramouli narayanan/* 1457c1da8d0Schandramouli narayanan * Keep 2 iterations precalculated at a time: 
1467c1da8d0Schandramouli narayanan * - 80 DWORDs per iteration * 2 1477c1da8d0Schandramouli narayanan */ 1487c1da8d0Schandramouli narayanan#define W_SIZE (80*2*2 +16) 1497c1da8d0Schandramouli narayanan 1507c1da8d0Schandramouli narayanan#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) 1517c1da8d0Schandramouli narayanan#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) 1527c1da8d0Schandramouli narayanan 1537c1da8d0Schandramouli narayanan 1547c1da8d0Schandramouli narayanan.macro UPDATE_HASH hash, val 1557c1da8d0Schandramouli narayanan add \hash, \val 1567c1da8d0Schandramouli narayanan mov \val, \hash 1577c1da8d0Schandramouli narayanan.endm 1587c1da8d0Schandramouli narayanan 1597c1da8d0Schandramouli narayanan.macro PRECALC_RESET_WY 1607c1da8d0Schandramouli narayanan .set WY_00, WY0 1617c1da8d0Schandramouli narayanan .set WY_04, WY4 1627c1da8d0Schandramouli narayanan .set WY_08, WY08 1637c1da8d0Schandramouli narayanan .set WY_12, WY12 1647c1da8d0Schandramouli narayanan .set WY_16, WY16 1657c1da8d0Schandramouli narayanan .set WY_20, WY20 1667c1da8d0Schandramouli narayanan .set WY_24, WY24 1677c1da8d0Schandramouli narayanan .set WY_28, WY28 1687c1da8d0Schandramouli narayanan .set WY_32, WY_00 1697c1da8d0Schandramouli narayanan.endm 1707c1da8d0Schandramouli narayanan 1717c1da8d0Schandramouli narayanan.macro PRECALC_ROTATE_WY 1727c1da8d0Schandramouli narayanan /* Rotate macros */ 1737c1da8d0Schandramouli narayanan .set WY_32, WY_28 1747c1da8d0Schandramouli narayanan .set WY_28, WY_24 1757c1da8d0Schandramouli narayanan .set WY_24, WY_20 1767c1da8d0Schandramouli narayanan .set WY_20, WY_16 1777c1da8d0Schandramouli narayanan .set WY_16, WY_12 1787c1da8d0Schandramouli narayanan .set WY_12, WY_08 1797c1da8d0Schandramouli narayanan .set WY_08, WY_04 1807c1da8d0Schandramouli narayanan .set WY_04, WY_00 1817c1da8d0Schandramouli narayanan .set WY_00, WY_32 1827c1da8d0Schandramouli narayanan 1837c1da8d0Schandramouli narayanan /* Define register aliases */ 
1847c1da8d0Schandramouli narayanan .set WY, WY_00 1857c1da8d0Schandramouli narayanan .set WY_minus_04, WY_04 1867c1da8d0Schandramouli narayanan .set WY_minus_08, WY_08 1877c1da8d0Schandramouli narayanan .set WY_minus_12, WY_12 1887c1da8d0Schandramouli narayanan .set WY_minus_16, WY_16 1897c1da8d0Schandramouli narayanan .set WY_minus_20, WY_20 1907c1da8d0Schandramouli narayanan .set WY_minus_24, WY_24 1917c1da8d0Schandramouli narayanan .set WY_minus_28, WY_28 1927c1da8d0Schandramouli narayanan .set WY_minus_32, WY 1937c1da8d0Schandramouli narayanan.endm 1947c1da8d0Schandramouli narayanan 1957c1da8d0Schandramouli narayanan.macro PRECALC_00_15 1967c1da8d0Schandramouli narayanan .if (i == 0) # Initialize and rotate registers 1977c1da8d0Schandramouli narayanan PRECALC_RESET_WY 1987c1da8d0Schandramouli narayanan PRECALC_ROTATE_WY 1997c1da8d0Schandramouli narayanan .endif 2007c1da8d0Schandramouli narayanan 2017c1da8d0Schandramouli narayanan /* message scheduling pre-compute for rounds 0-15 */ 2027c1da8d0Schandramouli narayanan .if ((i & 7) == 0) 2037c1da8d0Schandramouli narayanan /* 2047c1da8d0Schandramouli narayanan * blended AVX2 and ALU instruction scheduling 2057c1da8d0Schandramouli narayanan * 1 vector iteration per 8 rounds 2067c1da8d0Schandramouli narayanan */ 2078861249cSmegha.dey@linux.intel.com vmovdqu (i * 2)(BUFFER_PTR), W_TMP 2087c1da8d0Schandramouli narayanan .elseif ((i & 7) == 1) 2098861249cSmegha.dey@linux.intel.com vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ 2107c1da8d0Schandramouli narayanan WY_TMP, WY_TMP 2117c1da8d0Schandramouli narayanan .elseif ((i & 7) == 2) 2127c1da8d0Schandramouli narayanan vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY 2137c1da8d0Schandramouli narayanan .elseif ((i & 7) == 4) 2148861249cSmegha.dey@linux.intel.com vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP 2157c1da8d0Schandramouli narayanan .elseif ((i & 7) == 7) 2167c1da8d0Schandramouli narayanan vmovdqu WY_TMP, PRECALC_WK(i&~7) 2177c1da8d0Schandramouli narayanan 2187c1da8d0Schandramouli 
narayanan PRECALC_ROTATE_WY 2197c1da8d0Schandramouli narayanan .endif 2207c1da8d0Schandramouli narayanan.endm 2217c1da8d0Schandramouli narayanan 2227c1da8d0Schandramouli narayanan.macro PRECALC_16_31 2237c1da8d0Schandramouli narayanan /* 2247c1da8d0Schandramouli narayanan * message scheduling pre-compute for rounds 16-31 2257c1da8d0Schandramouli narayanan * calculating last 32 w[i] values in 8 XMM registers 2267c1da8d0Schandramouli narayanan * pre-calculate K+w[i] values and store to mem 2277c1da8d0Schandramouli narayanan * for later load by ALU add instruction 2287c1da8d0Schandramouli narayanan * 2297c1da8d0Schandramouli narayanan * "brute force" vectorization for rounds 16-31 only 2307c1da8d0Schandramouli narayanan * due to w[i]->w[i-3] dependency 2317c1da8d0Schandramouli narayanan */ 2327c1da8d0Schandramouli narayanan .if ((i & 7) == 0) 2337c1da8d0Schandramouli narayanan /* 2347c1da8d0Schandramouli narayanan * blended AVX2 and ALU instruction scheduling 2357c1da8d0Schandramouli narayanan * 1 vector iteration per 8 rounds 2367c1da8d0Schandramouli narayanan */ 2377c1da8d0Schandramouli narayanan /* w[i-14] */ 2387c1da8d0Schandramouli narayanan vpalignr $8, WY_minus_16, WY_minus_12, WY 2397c1da8d0Schandramouli narayanan vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ 2407c1da8d0Schandramouli narayanan .elseif ((i & 7) == 1) 2417c1da8d0Schandramouli narayanan vpxor WY_minus_08, WY, WY 2427c1da8d0Schandramouli narayanan vpxor WY_minus_16, WY_TMP, WY_TMP 2437c1da8d0Schandramouli narayanan .elseif ((i & 7) == 2) 2447c1da8d0Schandramouli narayanan vpxor WY_TMP, WY, WY 2457c1da8d0Schandramouli narayanan vpslldq $12, WY, WY_TMP2 2467c1da8d0Schandramouli narayanan .elseif ((i & 7) == 3) 2477c1da8d0Schandramouli narayanan vpslld $1, WY, WY_TMP 2487c1da8d0Schandramouli narayanan vpsrld $31, WY, WY 2497c1da8d0Schandramouli narayanan .elseif ((i & 7) == 4) 2507c1da8d0Schandramouli narayanan vpor WY, WY_TMP, WY_TMP 2517c1da8d0Schandramouli narayanan vpslld $2, WY_TMP2, WY 
2527c1da8d0Schandramouli narayanan .elseif ((i & 7) == 5) 2537c1da8d0Schandramouli narayanan vpsrld $30, WY_TMP2, WY_TMP2 2547c1da8d0Schandramouli narayanan vpxor WY, WY_TMP, WY_TMP 2557c1da8d0Schandramouli narayanan .elseif ((i & 7) == 7) 2567c1da8d0Schandramouli narayanan vpxor WY_TMP2, WY_TMP, WY 2578861249cSmegha.dey@linux.intel.com vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP 2587c1da8d0Schandramouli narayanan vmovdqu WY_TMP, PRECALC_WK(i&~7) 2597c1da8d0Schandramouli narayanan 2607c1da8d0Schandramouli narayanan PRECALC_ROTATE_WY 2617c1da8d0Schandramouli narayanan .endif 2627c1da8d0Schandramouli narayanan.endm 2637c1da8d0Schandramouli narayanan 2647c1da8d0Schandramouli narayanan.macro PRECALC_32_79 2657c1da8d0Schandramouli narayanan /* 2667c1da8d0Schandramouli narayanan * in SHA-1 specification: 2677c1da8d0Schandramouli narayanan * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 2687c1da8d0Schandramouli narayanan * instead we do equal: 2697c1da8d0Schandramouli narayanan * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 2707c1da8d0Schandramouli narayanan * allows more efficient vectorization 2717c1da8d0Schandramouli narayanan * since w[i]=>w[i-3] dependency is broken 2727c1da8d0Schandramouli narayanan */ 2737c1da8d0Schandramouli narayanan 2747c1da8d0Schandramouli narayanan .if ((i & 7) == 0) 2757c1da8d0Schandramouli narayanan /* 2767c1da8d0Schandramouli narayanan * blended AVX2 and ALU instruction scheduling 2777c1da8d0Schandramouli narayanan * 1 vector iteration per 8 rounds 2787c1da8d0Schandramouli narayanan */ 2797c1da8d0Schandramouli narayanan vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP 2807c1da8d0Schandramouli narayanan .elseif ((i & 7) == 1) 2817c1da8d0Schandramouli narayanan /* W is W_minus_32 before xor */ 2827c1da8d0Schandramouli narayanan vpxor WY_minus_28, WY, WY 2837c1da8d0Schandramouli narayanan .elseif ((i & 7) == 2) 2847c1da8d0Schandramouli narayanan vpxor WY_minus_16, WY_TMP, WY_TMP 2857c1da8d0Schandramouli narayanan .elseif ((i & 7) == 3) 
2867c1da8d0Schandramouli narayanan vpxor WY_TMP, WY, WY 2877c1da8d0Schandramouli narayanan .elseif ((i & 7) == 4) 2887c1da8d0Schandramouli narayanan vpslld $2, WY, WY_TMP 2897c1da8d0Schandramouli narayanan .elseif ((i & 7) == 5) 2907c1da8d0Schandramouli narayanan vpsrld $30, WY, WY 2917c1da8d0Schandramouli narayanan vpor WY, WY_TMP, WY 2927c1da8d0Schandramouli narayanan .elseif ((i & 7) == 7) 2938861249cSmegha.dey@linux.intel.com vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP 2947c1da8d0Schandramouli narayanan vmovdqu WY_TMP, PRECALC_WK(i&~7) 2957c1da8d0Schandramouli narayanan 2967c1da8d0Schandramouli narayanan PRECALC_ROTATE_WY 2977c1da8d0Schandramouli narayanan .endif 2987c1da8d0Schandramouli narayanan.endm 2997c1da8d0Schandramouli narayanan 3007c1da8d0Schandramouli narayanan.macro PRECALC r, s 3017c1da8d0Schandramouli narayanan .set i, \r 3027c1da8d0Schandramouli narayanan 3037c1da8d0Schandramouli narayanan .if (i < 40) 3047c1da8d0Schandramouli narayanan .set K_XMM, 32*0 3057c1da8d0Schandramouli narayanan .elseif (i < 80) 3067c1da8d0Schandramouli narayanan .set K_XMM, 32*1 3077c1da8d0Schandramouli narayanan .elseif (i < 120) 3087c1da8d0Schandramouli narayanan .set K_XMM, 32*2 3097c1da8d0Schandramouli narayanan .else 3107c1da8d0Schandramouli narayanan .set K_XMM, 32*3 3117c1da8d0Schandramouli narayanan .endif 3127c1da8d0Schandramouli narayanan 3137c1da8d0Schandramouli narayanan .if (i<32) 3147c1da8d0Schandramouli narayanan PRECALC_00_15 \s 3157c1da8d0Schandramouli narayanan .elseif (i<64) 3167c1da8d0Schandramouli narayanan PRECALC_16_31 \s 3177c1da8d0Schandramouli narayanan .elseif (i < 160) 3187c1da8d0Schandramouli narayanan PRECALC_32_79 \s 3197c1da8d0Schandramouli narayanan .endif 3207c1da8d0Schandramouli narayanan.endm 3217c1da8d0Schandramouli narayanan 3227c1da8d0Schandramouli narayanan.macro ROTATE_STATE 3237c1da8d0Schandramouli narayanan .set T_REG, E 3247c1da8d0Schandramouli narayanan .set E, D 3257c1da8d0Schandramouli narayanan .set D, C 
3267c1da8d0Schandramouli narayanan .set C, B 3277c1da8d0Schandramouli narayanan .set B, TB 3287c1da8d0Schandramouli narayanan .set TB, A 3297c1da8d0Schandramouli narayanan .set A, T_REG 3307c1da8d0Schandramouli narayanan 3317c1da8d0Schandramouli narayanan .set T_REG, RE 3327c1da8d0Schandramouli narayanan .set RE, RD 3337c1da8d0Schandramouli narayanan .set RD, RC 3347c1da8d0Schandramouli narayanan .set RC, RB 3357c1da8d0Schandramouli narayanan .set RB, RTB 3367c1da8d0Schandramouli narayanan .set RTB, RA 3377c1da8d0Schandramouli narayanan .set RA, T_REG 3387c1da8d0Schandramouli narayanan.endm 3397c1da8d0Schandramouli narayanan 3407c1da8d0Schandramouli narayanan/* Macro relies on saved ROUND_Fx */ 3417c1da8d0Schandramouli narayanan 3427c1da8d0Schandramouli narayanan.macro RND_FUN f, r 3437c1da8d0Schandramouli narayanan .if (\f == RND_F1) 3447c1da8d0Schandramouli narayanan ROUND_F1 \r 3457c1da8d0Schandramouli narayanan .elseif (\f == RND_F2) 3467c1da8d0Schandramouli narayanan ROUND_F2 \r 3477c1da8d0Schandramouli narayanan .elseif (\f == RND_F3) 3487c1da8d0Schandramouli narayanan ROUND_F3 \r 3497c1da8d0Schandramouli narayanan .endif 3507c1da8d0Schandramouli narayanan.endm 3517c1da8d0Schandramouli narayanan 3527c1da8d0Schandramouli narayanan.macro RR r 3537c1da8d0Schandramouli narayanan .set round_id, (\r % 80) 3547c1da8d0Schandramouli narayanan 3557c1da8d0Schandramouli narayanan .if (round_id == 0) /* Precalculate F for first round */ 3567c1da8d0Schandramouli narayanan .set ROUND_FUNC, RND_F1 3577c1da8d0Schandramouli narayanan mov B, TB 3587c1da8d0Schandramouli narayanan 3597c1da8d0Schandramouli narayanan rorx $(32-30), B, B /* b>>>2 */ 3607c1da8d0Schandramouli narayanan andn D, TB, T1 3617c1da8d0Schandramouli narayanan and C, TB 3627c1da8d0Schandramouli narayanan xor T1, TB 3637c1da8d0Schandramouli narayanan .endif 3647c1da8d0Schandramouli narayanan 3657c1da8d0Schandramouli narayanan RND_FUN ROUND_FUNC, \r 3667c1da8d0Schandramouli narayanan ROTATE_STATE 
3677c1da8d0Schandramouli narayanan 3687c1da8d0Schandramouli narayanan .if (round_id == 18) 3697c1da8d0Schandramouli narayanan .set ROUND_FUNC, RND_F2 3707c1da8d0Schandramouli narayanan .elseif (round_id == 38) 3717c1da8d0Schandramouli narayanan .set ROUND_FUNC, RND_F3 3727c1da8d0Schandramouli narayanan .elseif (round_id == 58) 3737c1da8d0Schandramouli narayanan .set ROUND_FUNC, RND_F2 3747c1da8d0Schandramouli narayanan .endif 3757c1da8d0Schandramouli narayanan 3767c1da8d0Schandramouli narayanan .set round_id, ( (\r+1) % 80) 3777c1da8d0Schandramouli narayanan 3787c1da8d0Schandramouli narayanan RND_FUN ROUND_FUNC, (\r+1) 3797c1da8d0Schandramouli narayanan ROTATE_STATE 3807c1da8d0Schandramouli narayanan.endm 3817c1da8d0Schandramouli narayanan 3827c1da8d0Schandramouli narayanan.macro ROUND_F1 r 3837c1da8d0Schandramouli narayanan add WK(\r), E 3847c1da8d0Schandramouli narayanan 3857c1da8d0Schandramouli narayanan andn C, A, T1 /* ~b&d */ 3867c1da8d0Schandramouli narayanan lea (RE,RTB), E /* Add F from the previous round */ 3877c1da8d0Schandramouli narayanan 3887c1da8d0Schandramouli narayanan rorx $(32-5), A, TA /* T2 = A >>> 5 */ 3897c1da8d0Schandramouli narayanan rorx $(32-30),A, TB /* b>>>2 for next round */ 3907c1da8d0Schandramouli narayanan 3917c1da8d0Schandramouli narayanan PRECALC (\r) /* msg scheduling for next 2 blocks */ 3927c1da8d0Schandramouli narayanan 3937c1da8d0Schandramouli narayanan /* 3947c1da8d0Schandramouli narayanan * Calculate F for the next round 3957c1da8d0Schandramouli narayanan * (b & c) ^ andn[b, d] 3967c1da8d0Schandramouli narayanan */ 3977c1da8d0Schandramouli narayanan and B, A /* b&c */ 3987c1da8d0Schandramouli narayanan xor T1, A /* F1 = (b&c) ^ (~b&d) */ 3997c1da8d0Schandramouli narayanan 4007c1da8d0Schandramouli narayanan lea (RE,RTA), E /* E += A >>> 5 */ 4017c1da8d0Schandramouli narayanan.endm 4027c1da8d0Schandramouli narayanan 4037c1da8d0Schandramouli narayanan.macro ROUND_F2 r 4047c1da8d0Schandramouli narayanan add WK(\r), E 
4057c1da8d0Schandramouli narayanan lea (RE,RTB), E /* Add F from the previous round */ 4067c1da8d0Schandramouli narayanan 4077c1da8d0Schandramouli narayanan /* Calculate F for the next round */ 4087c1da8d0Schandramouli narayanan rorx $(32-5), A, TA /* T2 = A >>> 5 */ 4097c1da8d0Schandramouli narayanan .if ((round_id) < 79) 4107c1da8d0Schandramouli narayanan rorx $(32-30), A, TB /* b>>>2 for next round */ 4117c1da8d0Schandramouli narayanan .endif 4127c1da8d0Schandramouli narayanan PRECALC (\r) /* msg scheduling for next 2 blocks */ 4137c1da8d0Schandramouli narayanan 4147c1da8d0Schandramouli narayanan .if ((round_id) < 79) 4157c1da8d0Schandramouli narayanan xor B, A 4167c1da8d0Schandramouli narayanan .endif 4177c1da8d0Schandramouli narayanan 4187c1da8d0Schandramouli narayanan add TA, E /* E += A >>> 5 */ 4197c1da8d0Schandramouli narayanan 4207c1da8d0Schandramouli narayanan .if ((round_id) < 79) 4217c1da8d0Schandramouli narayanan xor C, A 4227c1da8d0Schandramouli narayanan .endif 4237c1da8d0Schandramouli narayanan.endm 4247c1da8d0Schandramouli narayanan 4257c1da8d0Schandramouli narayanan.macro ROUND_F3 r 4267c1da8d0Schandramouli narayanan add WK(\r), E 4277c1da8d0Schandramouli narayanan PRECALC (\r) /* msg scheduling for next 2 blocks */ 4287c1da8d0Schandramouli narayanan 4297c1da8d0Schandramouli narayanan lea (RE,RTB), E /* Add F from the previous round */ 4307c1da8d0Schandramouli narayanan 4317c1da8d0Schandramouli narayanan mov B, T1 4327c1da8d0Schandramouli narayanan or A, T1 4337c1da8d0Schandramouli narayanan 4347c1da8d0Schandramouli narayanan rorx $(32-5), A, TA /* T2 = A >>> 5 */ 4357c1da8d0Schandramouli narayanan rorx $(32-30), A, TB /* b>>>2 for next round */ 4367c1da8d0Schandramouli narayanan 4377c1da8d0Schandramouli narayanan /* Calculate F for the next round 4387c1da8d0Schandramouli narayanan * (b and c) or (d and (b or c)) 4397c1da8d0Schandramouli narayanan */ 4407c1da8d0Schandramouli narayanan and C, T1 4417c1da8d0Schandramouli narayanan and B, A 
4427c1da8d0Schandramouli narayanan or T1, A 4437c1da8d0Schandramouli narayanan 4447c1da8d0Schandramouli narayanan add TA, E /* E += A >>> 5 */ 4457c1da8d0Schandramouli narayanan 4467c1da8d0Schandramouli narayanan.endm 4477c1da8d0Schandramouli narayanan 4488861249cSmegha.dey@linux.intel.com/* Add constant only if (%2 > %3) condition met (uses RTA as temp) 4498861249cSmegha.dey@linux.intel.com * %1 + %2 >= %3 ? %4 : 0 4508861249cSmegha.dey@linux.intel.com */ 4518861249cSmegha.dey@linux.intel.com.macro ADD_IF_GE a, b, c, d 4528861249cSmegha.dey@linux.intel.com mov \a, RTA 4538861249cSmegha.dey@linux.intel.com add $\d, RTA 4548861249cSmegha.dey@linux.intel.com cmp $\c, \b 4558861249cSmegha.dey@linux.intel.com cmovge RTA, \a 4568861249cSmegha.dey@linux.intel.com.endm 4578861249cSmegha.dey@linux.intel.com 4587c1da8d0Schandramouli narayanan/* 4597c1da8d0Schandramouli narayanan * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining 4607c1da8d0Schandramouli narayanan */ 4617c1da8d0Schandramouli narayanan.macro SHA1_PIPELINED_MAIN_BODY 4627c1da8d0Schandramouli narayanan 4637c1da8d0Schandramouli narayanan REGALLOC 4647c1da8d0Schandramouli narayanan 4657c1da8d0Schandramouli narayanan mov (HASH_PTR), A 4667c1da8d0Schandramouli narayanan mov 4(HASH_PTR), B 4677c1da8d0Schandramouli narayanan mov 8(HASH_PTR), C 4687c1da8d0Schandramouli narayanan mov 12(HASH_PTR), D 4697c1da8d0Schandramouli narayanan mov 16(HASH_PTR), E 4707c1da8d0Schandramouli narayanan 4717c1da8d0Schandramouli narayanan mov %rsp, PRECALC_BUF 4727c1da8d0Schandramouli narayanan lea (2*4*80+32)(%rsp), WK_BUF 4737c1da8d0Schandramouli narayanan 4747c1da8d0Schandramouli narayanan # Precalc WK for first 2 blocks 4758861249cSmegha.dey@linux.intel.com ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 4767c1da8d0Schandramouli narayanan .set i, 0 4777c1da8d0Schandramouli narayanan .rept 160 4787c1da8d0Schandramouli narayanan PRECALC i 4797c1da8d0Schandramouli narayanan .set i, i + 1 4807c1da8d0Schandramouli 
narayanan .endr 4818861249cSmegha.dey@linux.intel.com 4828861249cSmegha.dey@linux.intel.com /* Go to next block if needed */ 4838861249cSmegha.dey@linux.intel.com ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 4848861249cSmegha.dey@linux.intel.com ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 4857c1da8d0Schandramouli narayanan xchg WK_BUF, PRECALC_BUF 4867c1da8d0Schandramouli narayanan 4877c1da8d0Schandramouli narayanan .align 32 488*94330fbeSArd Biesheuvel.L_loop: 4897c1da8d0Schandramouli narayanan /* 4907c1da8d0Schandramouli narayanan * code loops through more than one block 4917c1da8d0Schandramouli narayanan * we use K_BASE value as a signal of a last block, 4927c1da8d0Schandramouli narayanan * it is set below by: cmovae BUFFER_PTR, K_BASE 4937c1da8d0Schandramouli narayanan */ 4948861249cSmegha.dey@linux.intel.com test BLOCKS_CTR, BLOCKS_CTR 495*94330fbeSArd Biesheuvel jnz .L_begin 4967c1da8d0Schandramouli narayanan .align 32 497*94330fbeSArd Biesheuvel jmp .L_end 4987c1da8d0Schandramouli narayanan .align 32 499*94330fbeSArd Biesheuvel.L_begin: 5007c1da8d0Schandramouli narayanan 5017c1da8d0Schandramouli narayanan /* 5027c1da8d0Schandramouli narayanan * Do first block 5037c1da8d0Schandramouli narayanan * rounds: 0,2,4,6,8 5047c1da8d0Schandramouli narayanan */ 5057c1da8d0Schandramouli narayanan .set j, 0 5067c1da8d0Schandramouli narayanan .rept 5 5077c1da8d0Schandramouli narayanan RR j 5087c1da8d0Schandramouli narayanan .set j, j+2 5097c1da8d0Schandramouli narayanan .endr 5107c1da8d0Schandramouli narayanan 5117c1da8d0Schandramouli narayanan /* 5127c1da8d0Schandramouli narayanan * rounds: 5137c1da8d0Schandramouli narayanan * 10,12,14,16,18 5147c1da8d0Schandramouli narayanan * 20,22,24,26,28 5157c1da8d0Schandramouli narayanan * 30,32,34,36,38 5167c1da8d0Schandramouli narayanan * 40,42,44,46,48 5177c1da8d0Schandramouli narayanan * 50,52,54,56,58 5187c1da8d0Schandramouli narayanan */ 5197c1da8d0Schandramouli narayanan .rept 25 5207c1da8d0Schandramouli narayanan RR j 
	.set j, j+2
	.endr

	/* Update Counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed*/
	/* NOTE(review): ADD_IF_GE is defined earlier in this file; presumably it
	 * advances BUFFER_PTR by 128 bytes (two 64-byte blocks) only while
	 * BLOCKS_CTR >= 4 — confirm against the macro definition. */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
	/*
	 * rounds
	 * 60,62,64,66,68
	 * 70,72,74,76,78
	 */
	.rept 10
	RR	j
	.set j, j+2
	.endr

	/* Fold the working registers back into the 5-word SHA-1 digest at
	 * HASH_PTR (a,b,c,d,e at offsets 0,4,8,12,16).  UPDATE_HASH is defined
	 * earlier in the file.  Note B's value lives in TB at this point in the
	 * register rotation. */
	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* All blocks consumed?  Then loop back to the exit check. */
	test	BLOCKS_CTR, BLOCKS_CTR
	jz	.L_loop

	mov	TB, B

	/* Process second block */
	/*
	 * rounds
	 * 0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */

	.set j, 0
	.rept 10
	RR	j+80
	.set j, j+2
	.endr

	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
	RR	j+80
	.set j, j+2
	.endr

	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
	RR	j+80
	.set j, j+2
	.endr

	/* update counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed*/
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
	RR	j+80
	.set j, j+2
	.endr

	/* Second block done: fold working registers into the digest again. */
	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Reset state for AVX2 reg permutation */
	/* The round macros rotate the a..e roles through the GPRs; undo that
	 * rotation (via TA as scratch) so the next loop iteration starts from
	 * the canonical register assignment. */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	/* Swap the precalc and consume W-buffer pointers (double buffering):
	 * the buffer just consumed becomes the precalc target for the next
	 * two-block iteration. */
	xchg	WK_BUF, PRECALC_BUF

	jmp	.L_loop

	.align 32
.L_end:

.endm
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 *
 * Emits a function with the kernel's SHA-1 glue signature: CTX (digest
 * state), BUF (input data), CNT (block count) — register aliases are
 * defined earlier in this file.  The function is the thin prologue/
 * epilogue around SHA1_PIPELINED_MAIN_BODY above.
 */
.macro SHA1_VECTOR_ASM  name
SYM_FUNC_START(\name)

	/* Save all callee-saved GPRs used by the round macros. */
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	RESERVE_STACK  = (W_SIZE*4 + 8+24)

	/* Align stack */
	/* Keep the unaligned %rsp in %rbp so the epilogue can restore it;
	 * round %rsp down to a 32-byte boundary for aligned YMM spills. */
	push	%rbp
	mov	%rsp, %rbp
	and	$~(0x20-1), %rsp
	sub	$RESERVE_STACK, %rsp

	/* avoid AVX<->SSE transition penalties around the kernel's SSE state */
	avx2_zeroupper

	/* Setup initial values */
	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	/* Two-block pipeline: second buffer pointer starts at the same
	 * position; ADD_IF_GE staggers it inside the main body. */
	mov	BUF, BUFFER_PTR2
	mov	CNT, BLOCKS_CTR

	/* Load the big-endian byte-swap shuffle mask (see BSWAP_SHUFB_CTL). */
	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	/* Restore the original (unaligned) stack pointer and saved regs. */
	mov	%rbp, %rsp
	pop	%rbp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx

	RET

SYM_FUNC_END(\name)
.endm

.section .rodata

/* SHA-1 round constants (FIPS 180-4), one per 20-round group. */
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

/* Each K broadcast across all 8 dwords of a YMM register, two YMM rows
 * per constant as laid out below; 128-byte aligned for aligned loads. */
.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4

/* vpshufb control mask: reverses the bytes of each 32-bit word (the
 * 0x00010203... pattern), converting little-endian loads to the
 * big-endian word order SHA-1 requires; repeated for both 128-bit lanes. */
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
.text

SHA1_VECTOR_ASM     sha1_transform_avx2