xref: /openbmc/linux/arch/x86/crypto/sha1_avx2_x86_64_asm.S (revision 1ac731c529cd4d6adbce134754b51ff7d822b145)
17c1da8d0Schandramouli narayanan/*
27c1da8d0Schandramouli narayanan *	Implement fast SHA-1 with AVX2 instructions. (x86_64)
37c1da8d0Schandramouli narayanan *
47c1da8d0Schandramouli narayanan * This file is provided under a dual BSD/GPLv2 license.  When using or
57c1da8d0Schandramouli narayanan * redistributing this file, you may do so under either license.
67c1da8d0Schandramouli narayanan *
77c1da8d0Schandramouli narayanan * GPL LICENSE SUMMARY
87c1da8d0Schandramouli narayanan *
97c1da8d0Schandramouli narayanan * Copyright(c) 2014 Intel Corporation.
107c1da8d0Schandramouli narayanan *
117c1da8d0Schandramouli narayanan * This program is free software; you can redistribute it and/or modify
127c1da8d0Schandramouli narayanan * it under the terms of version 2 of the GNU General Public License as
137c1da8d0Schandramouli narayanan * published by the Free Software Foundation.
147c1da8d0Schandramouli narayanan *
157c1da8d0Schandramouli narayanan * This program is distributed in the hope that it will be useful, but
167c1da8d0Schandramouli narayanan * WITHOUT ANY WARRANTY; without even the implied warranty of
177c1da8d0Schandramouli narayanan * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
187c1da8d0Schandramouli narayanan * General Public License for more details.
197c1da8d0Schandramouli narayanan *
207c1da8d0Schandramouli narayanan * Contact Information:
217c1da8d0Schandramouli narayanan * Ilya Albrekht <ilya.albrekht@intel.com>
227c1da8d0Schandramouli narayanan * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
237c1da8d0Schandramouli narayanan * Ronen Zohar <ronen.zohar@intel.com>
247c1da8d0Schandramouli narayanan * Chandramouli Narayanan <mouli@linux.intel.com>
257c1da8d0Schandramouli narayanan *
267c1da8d0Schandramouli narayanan * BSD LICENSE
277c1da8d0Schandramouli narayanan *
287c1da8d0Schandramouli narayanan * Copyright(c) 2014 Intel Corporation.
297c1da8d0Schandramouli narayanan *
307c1da8d0Schandramouli narayanan * Redistribution and use in source and binary forms, with or without
317c1da8d0Schandramouli narayanan * modification, are permitted provided that the following conditions
327c1da8d0Schandramouli narayanan * are met:
337c1da8d0Schandramouli narayanan *
347c1da8d0Schandramouli narayanan * Redistributions of source code must retain the above copyright
357c1da8d0Schandramouli narayanan * notice, this list of conditions and the following disclaimer.
367c1da8d0Schandramouli narayanan * Redistributions in binary form must reproduce the above copyright
377c1da8d0Schandramouli narayanan * notice, this list of conditions and the following disclaimer in
387c1da8d0Schandramouli narayanan * the documentation and/or other materials provided with the
397c1da8d0Schandramouli narayanan * distribution.
407c1da8d0Schandramouli narayanan * Neither the name of Intel Corporation nor the names of its
417c1da8d0Schandramouli narayanan * contributors may be used to endorse or promote products derived
427c1da8d0Schandramouli narayanan * from this software without specific prior written permission.
437c1da8d0Schandramouli narayanan *
447c1da8d0Schandramouli narayanan * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
457c1da8d0Schandramouli narayanan * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
467c1da8d0Schandramouli narayanan * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
477c1da8d0Schandramouli narayanan * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
487c1da8d0Schandramouli narayanan * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
497c1da8d0Schandramouli narayanan * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
507c1da8d0Schandramouli narayanan * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
517c1da8d0Schandramouli narayanan * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
527c1da8d0Schandramouli narayanan * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
537c1da8d0Schandramouli narayanan * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
547c1da8d0Schandramouli narayanan * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
557c1da8d0Schandramouli narayanan *
567c1da8d0Schandramouli narayanan */
577c1da8d0Schandramouli narayanan
587c1da8d0Schandramouli narayanan/*
597c1da8d0Schandramouli narayanan * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
607c1da8d0Schandramouli narayanan *
617c1da8d0Schandramouli narayanan *This implementation is based on the previous SSSE3 release:
627c1da8d0Schandramouli narayanan *Visit http://software.intel.com/en-us/articles/
637c1da8d0Schandramouli narayanan *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
647c1da8d0Schandramouli narayanan *
6541419a28SKees Cook *Updates 20-byte SHA-1 record at start of 'state', from 'input', for
6641419a28SKees Cook *even number of 'blocks' consecutive 64-byte blocks.
677c1da8d0Schandramouli narayanan *
687c1da8d0Schandramouli narayanan *extern "C" void sha1_transform_avx2(
6941419a28SKees Cook *	struct sha1_state *state, const u8* input, int blocks );
707c1da8d0Schandramouli narayanan */
717c1da8d0Schandramouli narayanan
727c1da8d0Schandramouli narayanan#include <linux/linkage.h>
737c1da8d0Schandramouli narayanan
/*
 * Function arguments (x86_64 SysV ABI):
 * sha1_transform_avx2(struct sha1_state *state, const u8 *input, int blocks)
 */
#define	CTX	%rdi	/* arg1 */
#define BUF	%rsi	/* arg2 */
#define CNT	%rdx	/* arg3 */

/*
 * Physical registers backing the symbolic working-state names bound by
 * REGALLOC.  A..E are the five 32-bit SHA-1 state words; TB/TA are round
 * scratch; RA..RE/RTA/RTB are the matching 64-bit views used by the
 * LEA-based additions.  Note that these reuse the argument registers
 * (%ecx/%esi/%edi/%eax/%edx), so the incoming arguments are presumably
 * parked in HASH_PTR/BUFFER_PTR/BLOCKS_CTR by the entry code before the
 * round macros run (entry code not visible in this chunk — verify).
 */
#define	REG_A	%ecx
#define	REG_B	%esi
#define	REG_C	%edi
#define	REG_D	%eax
#define	REG_E	%edx
#define	REG_TB	%ebx
#define	REG_TA	%r12d
#define	REG_RA	%rcx
#define	REG_RB	%rsi
#define	REG_RC	%rdi
#define	REG_RD	%rax
#define	REG_RE	%rdx
#define	REG_RTA	%r12
#define	REG_RTB	%rbx
#define	REG_T1	%r11d
#define	xmm_mov	vmovups
#define	avx2_zeroupper	vzeroupper
/* Round-function selectors consumed by RND_FUN / RR */
#define	RND_F1	1
#define	RND_F2	2
#define	RND_F3	3
987c1da8d0Schandramouli narayanan
/*
 * REGALLOC - (re)bind the symbolic state names to their initial
 * physical registers.  ROTATE_STATE subsequently permutes these
 * aliases instead of moving data, so REGALLOC must be re-run whenever
 * the canonical assignment is needed again (e.g. per block pair).
 */
.macro REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set TB, REG_TB
	.set TA, REG_TA

	.set RA, REG_RA
	.set RB, REG_RB
	.set RC, REG_RC
	.set RD, REG_RD
	.set RE, REG_RE

	.set RTA, REG_RTA
	.set RTB, REG_RTB

	.set T1, REG_T1
.endm
1197c1da8d0Schandramouli narayanan
/* Stable copies of the arguments, surviving the round computation */
#define HASH_PTR	%r9	/* -> 20-byte SHA-1 state */
#define BLOCKS_CTR	%r8	/* remaining 64-byte blocks */
#define BUFFER_PTR	%r10	/* first block of the current pair */
#define BUFFER_PTR2	%r13	/* second block of the current pair */

/* Double-buffered W+K storage; swapped with xchg between block pairs */
#define PRECALC_BUF	%r14	/* W+K being precalculated (next pair) */
#define WK_BUF		%r15	/* W+K consumed by the current rounds */

#define W_TMP		%xmm0
#define WY_TMP		%ymm0
#define WY_TMP2		%ymm9

# AVX2 variables - the 8-register sliding window of scheduled words
#define WY0		%ymm3
#define WY4		%ymm5
#define WY08		%ymm7
#define WY12		%ymm8
#define WY16		%ymm12
#define WY20		%ymm13
#define WY24		%ymm14
#define WY28		%ymm15

#define YMM_SHUFB_BSWAP	%ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE		(80*2*2 +16)

/*
 * Address of precalculated W[t]+K for round t.  Each group of four
 * rounds occupies one 32-byte YMM store: low 16 bytes = first block's
 * values, high 16 bytes (selected by ((t)/80)*16) = second block's.
 */
#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
/* Store slot for one 32-byte precalc group (t is a multiple of 8) */
#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
1527c1da8d0Schandramouli narayanan
1537c1da8d0Schandramouli narayanan
/*
 * UPDATE_HASH hash, val - add working-state word \val into the
 * in-memory digest word \hash, and leave the updated digest value in
 * \val as well (it seeds the state for the next block).
 */
.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm
1587c1da8d0Schandramouli narayanan
/* Reset the WY_xx window aliases to their canonical YMM assignment */
.macro PRECALC_RESET_WY
	.set WY_00, WY0
	.set WY_04, WY4
	.set WY_08, WY08
	.set WY_12, WY12
	.set WY_16, WY16
	.set WY_20, WY20
	.set WY_24, WY24
	.set WY_28, WY28
	.set WY_32, WY_00	/* window wraps: slot 32 reuses slot 0 */
.endm
1707c1da8d0Schandramouli narayanan
/*
 * Slide the 8-register window of scheduled words by renaming the
 * WY_xx aliases - no data is moved.  Also refreshes the WY_minus_xx
 * relative names used by the scheduling math.
 */
.macro PRECALC_ROTATE_WY
	/* Rotate macros */
	.set WY_32, WY_28
	.set WY_28, WY_24
	.set WY_24, WY_20
	.set WY_20, WY_16
	.set WY_16, WY_12
	.set WY_12, WY_08
	.set WY_08, WY_04
	.set WY_04, WY_00
	.set WY_00, WY_32

	/* Define register aliases */
	.set WY, WY_00
	.set WY_minus_04, WY_04
	.set WY_minus_08, WY_08
	.set WY_minus_12, WY_12
	.set WY_minus_16, WY_16
	.set WY_minus_20, WY_20
	.set WY_minus_24, WY_24
	.set WY_minus_28, WY_28
	.set WY_minus_32, WY
.endm
1947c1da8d0Schandramouli narayanan
/*
 * PRECALC_00_15 - message scheduling for rounds 0-15, spread over 8
 * i-steps.  Loads 16 message bytes from each of the two blocks
 * (BUFFER_PTR -> low lane, BUFFER_PTR2 -> high lane), byte-swaps them
 * to big-endian dwords, adds the round constant and stores W+K to the
 * precalc buffer for later consumption by the ALU rounds.
 */
.macro PRECALC_00_15
	.if (i == 0) # Initialize and rotate registers
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	/* message scheduling pre-compute for rounds 0-15 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vmovdqu (i * 2)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		/* second block's words go into the high 128-bit lane */
		vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
			 WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
	.elseif ((i & 7) == 7)
		vmovdqu  WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
2217c1da8d0Schandramouli narayanan
.macro PRECALC_16_31
	/*
	 * message scheduling pre-compute for rounds 16-31
	 * calculating last 32 w[i] values in 8 XMM registers
	 * pre-calculate K+w[i] values and store to mem
	 * for later load by ALU add instruction
	 *
	 * "brute force" vectorization for rounds 16-31 only
	 * due to w[i]->w[i-3] dependency
	 *
	 * Computes w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
	 * for 4 dwords at a time; the fourth dword's missing w[i-3]
	 * (it is being produced in this very step) is patched in via
	 * the vpslldq/vpslld $2/vpsrld $30 sequence below.
	 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		/* w[i-14] */
		vpalignr	$8, WY_minus_16, WY_minus_12, WY
		vpsrldq	$4, WY_minus_04, WY_TMP               /* w[i-3] */
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY
		vpslldq	$12, WY, WY_TMP2	/* isolate the dword needing the fixup */
	.elseif ((i & 7) == 3)
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY		/* (x rol 1) built from shl/shr */
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP
		vpslld	$2, WY_TMP2, WY
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2	/* fixup term rotated by 2 */
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		vpxor	WY_TMP2, WY_TMP, WY
		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
2637c1da8d0Schandramouli narayanan
.macro PRECALC_32_79
	/*
	 * in SHA-1 specification:
	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
	 * instead we do equal:
	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	 * allows more efficient vectorization
	 * since w[i]=>w[i-3] dependency is broken
	 */

	.if   ((i & 7) == 0)
	/*
	 * blended AVX2 and ALU instruction scheduling
	 * 1 vector iteration per 8 rounds
	 */
		/* w[i-6] built from the two neighbouring window slots */
		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* W is W_minus_32 before xor */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		/* (x rol 2) assembled from shl $2 / shr $30 */
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY
	.elseif ((i & 7) == 7)
		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
2997c1da8d0Schandramouli narayanan
/*
 * PRECALC - one step of the message-schedule precompute state machine.
 * \r counts 0..159: two i-steps per round, both blocks of the pair
 * handled together in the YMM lanes.  Selects the round-constant
 * offset K_XMM into the K_XMM_AR table (i < 40 <=> rounds 0-19, etc.)
 * and dispatches to the appropriate scheduling stage.
 */
.macro PRECALC r, s
	.set i, \r

	.if (i < 40)
		.set K_XMM, 32*0
	.elseif (i < 80)
		.set K_XMM, 32*1
	.elseif (i < 120)
		.set K_XMM, 32*2
	.else
		.set K_XMM, 32*3
	.endif

	.if (i<32)
		PRECALC_00_15	\s
	.elseif (i<64)
		PRECALC_16_31	\s
	.elseif (i < 160)
		PRECALC_32_79	\s
	.endif
.endm
3217c1da8d0Schandramouli narayanan
/*
 * ROTATE_STATE - rotate the SHA-1 working-variable names (a,b,c,d,e)
 * for the next round by renaming the aliases, avoiding register moves.
 * The 32-bit and 64-bit views are rotated in lockstep.  Note the
 * previous round's A becomes TB (it holds b>>>2 / the saved F value).
 */
.macro ROTATE_STATE
	.set T_REG, E
	.set E, D
	.set D, C
	.set C, B
	.set B, TB
	.set TB, A
	.set A, T_REG

	.set T_REG, RE
	.set RE, RD
	.set RD, RC
	.set RC, RB
	.set RB, RTB
	.set RTB, RA
	.set RA, T_REG
.endm
3397c1da8d0Schandramouli narayanan
/* Macro relies on saved ROUND_Fx */

/* Dispatch round \r to the body selected by \f (RND_F1/RND_F2/RND_F3) */
.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm
3517c1da8d0Schandramouli narayanan
/*
 * RR - execute two SHA-1 rounds, \r and \r+1.
 *
 * Each ROUND_Fx body consumes the F value saved by the previous round
 * (in TB) and precomputes F for the *following* round.  Hence:
 *  - at round_id 0 the F1 value for the very first round must be
 *    precomputed inline here;
 *  - the selector flips after round 18/38/58, so that the round-19/39/59
 *    body already precomputes the next group's F.
 */
.macro RR r
	.set round_id, (\r % 80)

	.if (round_id == 0)        /* Precalculate F for first round */
		.set ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B    /* b>>>2 */
		andn	D, TB, T1
		and	C, TB
		xor	T1, TB	/* TB = (b&c) ^ (~b&d) = F1 */
	.endif

	RND_FUN ROUND_FUNC, \r
	ROTATE_STATE

	.if   (round_id == 18)
		.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set ROUND_FUNC, RND_F2
	.endif

	.set round_id, ( (\r+1) % 80)

	RND_FUN ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm
3817c1da8d0Schandramouli narayanan
/*
 * ROUND_F1 - one round with F1 = (b & c) ^ (~b & d)  (rounds 0-19).
 * Adds W+K, the F value saved from the previous round, and a>>>5 into
 * e; precomputes F1 for the next round in A (which ROTATE_STATE turns
 * into TB).  One PRECALC step for the next block pair is interleaved
 * for instruction-level parallelism.
 */
.macro ROUND_F1 r
	add	WK(\r), E

	andn	C, A, T1			/* ~b&d */
	lea	(RE,RTB), E		/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30),A, TB		/* b>>>2 for next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	/*
	 * Calculate F for the next round
	 * (b & c) ^ andn[b, d]
	 */
	and	B, A			/* b&c */
	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

	lea	(RE,RTA), E		/* E += A >>> 5 */
.endm
4027c1da8d0Schandramouli narayanan
/*
 * ROUND_F2 - one round with F2 = b ^ c ^ d  (rounds 20-39 and 60-79).
 * Next-round F is built incrementally in A (xor B, then xor C); both
 * xors and the b>>>2 are skipped on the final round (79), where no
 * further F is needed.
 */
.macro ROUND_F2 r
	add	WK(\r), E
	lea	(RE,RTB), E		/* Add F from the previous round */

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	/* b>>>2 for next round */
	.endif
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	.if ((round_id) < 79)
		xor	B, A
	.endif

	add	TA, E			/* E += A >>> 5 */

	.if ((round_id) < 79)
		xor	C, A		/* F2 = b ^ c ^ d */
	.endif
.endm
4247c1da8d0Schandramouli narayanan
/*
 * ROUND_F3 - one round with the majority function (rounds 40-59):
 * F3 = (b & c) | (d & (b | c)).  As with F1/F2, the F value for the
 * next round is left in A, and one PRECALC step is interleaved.
 */
.macro ROUND_F3 r
	add	WK(\r), E
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	lea	(RE,RTB), E		/* Add F from the previous round */

	mov	B, T1
	or	A, T1			/* b | c */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	/* Calculate F for the next round
	 * (b and c) or (d and (b or c))
	 */
	and	C, T1			/* d & (b|c) */
	and	B, A			/* b & c */
	or	T1, A			/* F3 */

	add	TA, E			/* E += A >>> 5 */

.endm
4477c1da8d0Schandramouli narayanan
/*
 * ADD_IF_GE a, b, c, d:
 *	a += d  if (b >= c), else a is left unchanged.
 * Uses RTA as a scratch register; clobbers flags.
 * Used to advance the buffer pointers only while blocks remain.
 */
.macro ADD_IF_GE a, b, c, d
	mov     \a, RTA
	add     $\d, RTA
	cmp     $\c, \b
	cmovge  RTA, \a
.endm
4578861249cSmegha.dey@linux.intel.com
4587c1da8d0Schandramouli narayanan/*
4597c1da8d0Schandramouli narayanan * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
4607c1da8d0Schandramouli narayanan */
4617c1da8d0Schandramouli narayanan.macro SHA1_PIPELINED_MAIN_BODY
4627c1da8d0Schandramouli narayanan
4637c1da8d0Schandramouli narayanan	REGALLOC
4647c1da8d0Schandramouli narayanan
4657c1da8d0Schandramouli narayanan	mov	(HASH_PTR), A
4667c1da8d0Schandramouli narayanan	mov	4(HASH_PTR), B
4677c1da8d0Schandramouli narayanan	mov	8(HASH_PTR), C
4687c1da8d0Schandramouli narayanan	mov	12(HASH_PTR), D
4697c1da8d0Schandramouli narayanan	mov	16(HASH_PTR), E
4707c1da8d0Schandramouli narayanan
4717c1da8d0Schandramouli narayanan	mov	%rsp, PRECALC_BUF
4727c1da8d0Schandramouli narayanan	lea	(2*4*80+32)(%rsp), WK_BUF
4737c1da8d0Schandramouli narayanan
4747c1da8d0Schandramouli narayanan	# Precalc WK for first 2 blocks
4758861249cSmegha.dey@linux.intel.com	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
4767c1da8d0Schandramouli narayanan	.set i, 0
4777c1da8d0Schandramouli narayanan	.rept    160
4787c1da8d0Schandramouli narayanan		PRECALC i
4797c1da8d0Schandramouli narayanan		.set i, i + 1
4807c1da8d0Schandramouli narayanan	.endr
4818861249cSmegha.dey@linux.intel.com
4828861249cSmegha.dey@linux.intel.com	/* Go to next block if needed */
4838861249cSmegha.dey@linux.intel.com	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
4848861249cSmegha.dey@linux.intel.com	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
4857c1da8d0Schandramouli narayanan	xchg	WK_BUF, PRECALC_BUF
4867c1da8d0Schandramouli narayanan
4877c1da8d0Schandramouli narayanan	.align 32
488*94330fbeSArd Biesheuvel.L_loop:
4897c1da8d0Schandramouli narayanan	/*
4907c1da8d0Schandramouli narayanan	 * code loops through more than one block
4917c1da8d0Schandramouli narayanan	 * we use K_BASE value as a signal of a last block,
4927c1da8d0Schandramouli narayanan	 * it is set below by: cmovae BUFFER_PTR, K_BASE
4937c1da8d0Schandramouli narayanan	 */
4948861249cSmegha.dey@linux.intel.com	test BLOCKS_CTR, BLOCKS_CTR
495*94330fbeSArd Biesheuvel	jnz .L_begin
4967c1da8d0Schandramouli narayanan	.align 32
497*94330fbeSArd Biesheuvel	jmp	.L_end
4987c1da8d0Schandramouli narayanan	.align 32
499*94330fbeSArd Biesheuvel.L_begin:
5007c1da8d0Schandramouli narayanan
5017c1da8d0Schandramouli narayanan	/*
5027c1da8d0Schandramouli narayanan	 * Do first block
5037c1da8d0Schandramouli narayanan	 * rounds: 0,2,4,6,8
5047c1da8d0Schandramouli narayanan	 */
5057c1da8d0Schandramouli narayanan	.set j, 0
5067c1da8d0Schandramouli narayanan	.rept 5
5077c1da8d0Schandramouli narayanan		RR	j
5087c1da8d0Schandramouli narayanan		.set j, j+2
5097c1da8d0Schandramouli narayanan	.endr
5107c1da8d0Schandramouli narayanan
5117c1da8d0Schandramouli narayanan	/*
5127c1da8d0Schandramouli narayanan	 * rounds:
5137c1da8d0Schandramouli narayanan	 * 10,12,14,16,18
5147c1da8d0Schandramouli narayanan	 * 20,22,24,26,28
5157c1da8d0Schandramouli narayanan	 * 30,32,34,36,38
5167c1da8d0Schandramouli narayanan	 * 40,42,44,46,48
5177c1da8d0Schandramouli narayanan	 * 50,52,54,56,58
5187c1da8d0Schandramouli narayanan	 */
5197c1da8d0Schandramouli narayanan	.rept 25
5207c1da8d0Schandramouli narayanan		RR	j
5217c1da8d0Schandramouli narayanan		.set j, j+2
5227c1da8d0Schandramouli narayanan	.endr
5237c1da8d0Schandramouli narayanan
5248861249cSmegha.dey@linux.intel.com	/* Update Counter */
5258861249cSmegha.dey@linux.intel.com	sub $1, BLOCKS_CTR
5268861249cSmegha.dey@linux.intel.com	/* Move to the next block only if needed*/
5278861249cSmegha.dey@linux.intel.com	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
5287c1da8d0Schandramouli narayanan	/*
5297c1da8d0Schandramouli narayanan	 * rounds
5307c1da8d0Schandramouli narayanan	 * 60,62,64,66,68
5317c1da8d0Schandramouli narayanan	 * 70,72,74,76,78
5327c1da8d0Schandramouli narayanan	 */
5337c1da8d0Schandramouli narayanan	.rept 10
5347c1da8d0Schandramouli narayanan		RR	j
5357c1da8d0Schandramouli narayanan		.set j, j+2
	.endr

	/*
	 * End of the first block's rounds: fold the working variables
	 * back into the digest in memory.  UPDATE_HASH is defined
	 * earlier in this file (outside this view); presumably it adds
	 * the register into the dword at the given offset from HASH_PTR
	 * and stores the sum back -- confirm at its definition.
	 * Note B lives in TB at this point (see "mov TB, B" below).
	 */
	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/*
	 * No blocks left to process?  Jump back to .L_loop; its entry
	 * check (earlier in this macro, not visible here) presumably
	 * dispatches to .L_end when BLOCKS_CTR is zero -- confirm there.
	 */
	test	BLOCKS_CTR, BLOCKS_CTR
	jz	.L_loop

	/* Restore B from its temporary before starting the next block. */
	mov	TB, B

	/* Process second block */
	/*
	 * rounds
	 *  0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */

	/*
	 * RR (defined earlier in this file) emits two SHA-1 rounds per
	 * invocation, hence the step of 2 and ten repetitions per group
	 * of 20 rounds.  The +80 offset selects the second block's
	 * precomputed message schedule.
	 */
	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/* update counter */
	sub     $1, BLOCKS_CTR
	/* Move to the next block only if needed*/
	/*
	 * ADD_IF_GE (defined earlier in this file) presumably advances
	 * BUFFER_PTR2 by 128 bytes (two 64-byte blocks) only when
	 * BLOCKS_CTR >= 4, so the pointer is never walked past the end
	 * of the input on the final iterations -- confirm at its
	 * definition.
	 */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/* Fold the second block's working variables into the digest. */
	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Reset state for AVX2 reg permutation */
	/*
	 * The round macros rotate which register plays which SHA-1
	 * variable; this shuffle (via TA as scratch) puts the state
	 * back into the arrangement the top of .L_loop expects.
	 */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	/* Re-establish the symbolic register mapping (defined earlier). */
	REGALLOC

	/*
	 * Swap the two scratch areas: the W/K buffer just consumed
	 * becomes the precalc target for the next iteration, and
	 * vice versa (double-buffering of the message schedule).
	 */
	xchg	WK_BUF, PRECALC_BUF

	jmp	.L_loop

	.align 32
.L_end:

.endm
6217c1da8d0Schandramouli narayanan/*
6227c1da8d0Schandramouli narayanan * macro implements SHA-1 function's body for several 64-byte blocks
6237c1da8d0Schandramouli narayanan * param: function's name
6247c1da8d0Schandramouli narayanan */
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 *
 * Instantiates a complete, ABI-compliant function:
 *	void \name(struct sha1_state *ctx, const u8 *data, int blocks)
 * (signature presumed from the CTX/BUF/CNT aliases defined earlier
 * in this file -- confirm against the C-side declaration.)
 */
.macro SHA1_VECTOR_ASM  name
	SYM_FUNC_START(\name)

	/* Save the callee-saved GPRs the implementation clobbers. */
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	/*
	 * Scratch area: the W message-schedule buffer (W_SIZE dwords,
	 * W_SIZE defined earlier in this file) plus 8+24 spare bytes.
	 */
	RESERVE_STACK  = (W_SIZE*4 + 8+24)

	/* Align stack */
	/*
	 * Anchor the original stack pointer in %rbp, then round %rsp
	 * down to a 32-byte boundary so ymm-sized loads/stores into the
	 * scratch area are aligned, and carve out the scratch space.
	 */
	push	%rbp
	mov	%rsp, %rbp
	and	$~(0x20-1), %rsp
	sub	$RESERVE_STACK, %rsp

	/*
	 * avx2_zeroupper is a macro defined elsewhere; presumably it
	 * emits vzeroupper to avoid AVX/SSE transition penalties --
	 * confirm at its definition.
	 */
	avx2_zeroupper

	/* Setup initial values */
	/* CTX/BUF/CNT are the incoming argument registers (aliases
	 * defined earlier in this file). */
	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	/* Second buffer pointer for the two-blocks-in-flight scheme. */
	mov	BUF, BUFFER_PTR2
	mov	CNT, BLOCKS_CTR

	/* Load the per-dword byte-swap shuffle mask (see .rodata). */
	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	/* Undo the alignment frame, then restore callee-saved GPRs
	 * in reverse order of the pushes above. */
	mov	%rbp, %rsp
	pop	%rbp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx

	RET

	SYM_FUNC_END(\name)
.endm
6707c1da8d0Schandramouli narayanan
.section .rodata

/* The four SHA-1 round constants (FIPS 180-4): one per 20-round group. */
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

/*
 * Each constant is replicated into all 8 dword lanes so a single
 * 256-bit ymm load broadcasts it across the vectorized schedule;
 * aligned to 128 so every 32-byte row is naturally aligned.
 */
.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4

/*
 * vpshufb control mask: within each 32-bit lane the little-endian
 * encoding of 0x0c0d0e0f..0x00010203 selects source bytes 3,2,1,0,
 * i.e. it byte-reverses every dword -- converting the big-endian
 * SHA-1 message words to host order.  Repeated for both 128-bit
 * halves of a ymm register.
 */
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
.text

/* Instantiate the transform exported to the glue code. */
SHA1_VECTOR_ASM     sha1_transform_avx2
701