########################################################################
# Implement fast SHA-512 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     David Cote <david.m.cote@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-512 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules one block at a time, with 4 lanes per block
########################################################################
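########################################################################
# Reference sketch (not assembled; C-like pseudocode, all names
# illustrative): one SHA-512 block is 128 bytes, processed as 80 rounds
# over 64-bit words, per FIPS 180-4:
#
#	for (t = 16; t < 80; t++)	/* message schedule */
#		W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16];
#	for (t = 0; t < 80; t++) {	/* rounds */
#		T1 = h + Sigma1(e) + Ch(e,f,g) + K512[t] + W[t];
#		T2 = Sigma0(a) + Maj(a,b,c);
#		h = g; g = f; f = e; e = d + T1;
#		d = c; c = b; b = a; a = T1 + T2;
#	}
#
# where
#	Ch(e,f,g)  = (e & f) ^ (~e & g)
#	Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
#	Sigma0(a)  = (a ror 28) ^ (a ror 34) ^ (a ror 39)
#	Sigma1(e)  = (e ror 14) ^ (e ror 18) ^ (e ror 41)
#	sigma0(x)  = (x ror 1)  ^ (x ror 8)  ^ (x >> 7)
#	sigma1(x)  = (x ror 19) ^ (x ror 61) ^ (x >> 6)
#
# Note on the per-instruction comments below: "e >> 41" and similar
# annotate rorx, which is a rotate right, not a logical shift.
########################################################################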

#include <linux/linkage.h>

.text

# Virtual Registers
Y_0 = %ymm4
Y_1 = %ymm5
Y_2 = %ymm6
Y_3 = %ymm7

YTMP0 = %ymm0
YTMP1 = %ymm1
YTMP2 = %ymm2
YTMP3 = %ymm3
YTMP4 = %ymm8
XFER  = YTMP0

BYTE_FLIP_MASK  = %ymm9

# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
CTX1        = %rdi
CTX2        = %r12
# 2nd arg
INP         = %rsi
# 3rd arg
NUM_BLKS    = %rdx

c           = %rcx
d           = %r8
e           = %rdx
y3          = %rsi

TBL   = %rdi # clobbers CTX1

a     = %rax
b     = %rbx

f     = %r9
g     = %r10
h     = %r11
old_h = %r11

T1    = %r12 # clobbers CTX2
y0    = %r13
y1    = %r14
y2    = %r15

# Local variables (stack frame)
XFER_SIZE = 4*8
SRND_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
CTX_SIZE = 1*8

frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
frame_INP = frame_SRND + SRND_SIZE
frame_INPEND = frame_INP + INP_SIZE
frame_CTX = frame_INPEND + INPEND_SIZE
frame_size = frame_CTX + CTX_SIZE

## assume buffers not aligned
#define	VMOVDQ vmovdqu

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
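# For example, "addm 8*0(CTX2), a" (as used at the end of the main loop)
# expands to:
#	add	8*0(CTX2), a
#	mov	a, 8*0(CTX2)
# i.e. the digest word in memory is updated in place and the register
# keeps the sum as well.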


# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
# Load ymm with mem and byte swap each qword
.macro COPY_YMM_AND_BSWAP p1 p2 p3
	VMOVDQ \p2, \p1
	vpshufb \p3, \p1, \p1
.endm
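# For example, "COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK" loads 32
# message bytes and reverses the byte order within each 64-bit word,
# converting the big-endian message words to the CPU's little-endian
# layout.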
# rotate_Ys
# Rotate values of symbols Y0...Y3
.macro rotate_Ys
	Y_ = Y_0
	Y_0 = Y_1
	Y_1 = Y_2
	Y_2 = Y_3
	Y_3 = Y_
.endm

# RotateState
.macro RotateState
	# Rotate symbols a..h right
	old_h  = h
	TMP_   = h
	h      = g
	g      = f
	f      = e
	e      = d
	d      = c
	c      = b
	b      = a
	a      = TMP_
.endm

# macro MY_VPALIGNR	YDST, YSRC1, YSRC2, RVAL
# YDST = {YSRC1, YSRC2} >> RVAL*8
.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
	vperm2f128      $0x3, \YSRC2, \YSRC1, \YDST     # YDST = {YS1_LO, YS2_HI}
	vpalignr        $\RVAL, \YSRC2, \YDST, \YDST    # YDST = {YS1, YS2} >> RVAL*8
.endm
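# A single AVX2 vpalignr concatenates and shifts within each 128-bit
# lane separately, so it cannot pull bytes across the lane boundary of
# a YMM register. The vperm2f128 first forms {YSRC1_LO, YSRC2_HI}; the
# per-lane vpalignr on that then yields the full 256-bit shift. For
# example, "MY_VPALIGNR YTMP0, Y_3, Y_2, 8" (as used below) extracts
# the four qwords W[-7..-4], which straddle Y_2 and Y_3.

# The FOUR_ROUNDS_AND_SCHED macro below performs four rounds and,
# interleaved with them, computes the next four schedule words via
#	W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
# with sigma0/sigma1 as given in the header comment above.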

.macro FOUR_ROUNDS_AND_SCHED
################################### RND N + 0 #########################################

	# Extract w[t-7]
	MY_VPALIGNR	YTMP0, Y_3, Y_2, 8		# YTMP0 = W[-7]
	# Calculate w[t-16] + w[t-7]
	vpaddq		Y_0, YTMP0, YTMP0		# YTMP0 = W[-7] + W[-16]
	# Extract w[t-15]
	MY_VPALIGNR	YTMP1, Y_1, Y_0, 8		# YTMP1 = W[-15]

	# Calculate sigma0

	# Calculate w[t-15] ror 1
	vpsrlq		$1, YTMP1, YTMP2
	vpsllq		$(64-1), YTMP1, YTMP3
	vpor		YTMP2, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1
	# Calculate w[t-15] shr 7
	vpsrlq		$7, YTMP1, YTMP4		# YTMP4 = W[-15] >> 7
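	# AVX2 has no 64-bit vector rotate (vprorq arrived with AVX-512),
	# so each "ror n" here is synthesized from a shift pair:
	#	x ror n == (x >> n) | (x << (64 - n))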

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	add	frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a >> 34				# S0B

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

################################### RND N + 1 #########################################

	# Calculate w[t-15] ror 8
	vpsrlq		$8, YTMP1, YTMP2
	vpsllq		$(64-8), YTMP1, YTMP1
	vpor		YTMP2, YTMP1, YTMP1		# YTMP1 = W[-15] ror 8
	# XOR the three components
	vpxor		YTMP4, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
	vpxor		YTMP1, YTMP3, YTMP1		# YTMP1 = s0


	# Add three components, w[t-16], w[t-7] and sigma0
	vpaddq		YTMP1, YTMP0, YTMP0		# YTMP0 = W[-16] + W[-7] + s0
	# Move to appropriate lanes for calculating w[16] and w[17]
	vperm2f128	$0x0, YTMP0, YTMP0, Y_0		# Y_0 = W[-16] + W[-7] + s0 {BABA}
	# Move to appropriate lanes for calculating w[18] and w[19]
	vpand		MASK_YMM_LO(%rip), YTMP0, YTMP0	# YTMP0 = W[-16] + W[-7] + s0 {DC00}
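	# Notation: in {BABA}, {DC00} etc. the letters name which of the
	# four new schedule words A=w[16], B=w[17], C=w[18], D=w[19] each
	# 64-bit lane holds, listed from high lane to low; '-' or 0 marks
	# a lane holding nothing useful.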

	# Calculate w[16] and w[17] in both 128 bit lanes

	# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
	vperm2f128	$0x11, Y_3, Y_3, YTMP2		# YTMP2 = W[-2] {BABA}
	vpsrlq		$6, YTMP2, YTMP4		# YTMP4 = W[-2] >> 6 {BABA}


	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	add	1*8+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0

	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState


################################### RND N + 2 #########################################

	vpsrlq		$19, YTMP2, YTMP3		# YTMP3 = W[-2] >> 19 {BABA}
	vpsllq		$(64-19), YTMP2, YTMP1		# YTMP1 = W[-2] << 19 {BABA}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {BABA}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
	vpsrlq		$61, YTMP2, YTMP3		# YTMP3 = W[-2] >> 61 {BABA}
	vpsllq		$(64-61), YTMP2, YTMP1		# YTMP1 = W[-2] << 61 {BABA}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {BABA}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
							#  (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}

	# Add sigma1 to the other components to get w[16] and w[17]
	vpaddq		YTMP4, Y_0, Y_0			# Y_0 = {W[1], W[0], W[1], W[0]}

	# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
	vpsrlq		$6, Y_0, YTMP4			# YTMP4 = W[-2] >> 6 {DC--}

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	add	2*8+frame_XFER(%rsp), h		# h = k + w + h         # --

	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

################################### RND N + 3 #########################################

	vpsrlq		$19, Y_0, YTMP3			# YTMP3 = W[-2] >> 19 {DC--}
	vpsllq		$(64-19), Y_0, YTMP1		# YTMP1 = W[-2] << 19 {DC--}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {DC--}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
	vpsrlq		$61, Y_0, YTMP3			# YTMP3 = W[-2] >> 61 {DC--}
	vpsllq		$(64-61), Y_0, YTMP1		# YTMP1 = W[-2] << 61 {DC--}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {DC--}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
							#  (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}

	# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
	# to newly calculated sigma1 to get w[18] and w[19]
	vpaddq		YTMP4, YTMP0, YTMP2		# YTMP2 = {W[3], W[2], --, --}

	# Form w[19], w[18], w[17], w[16]
	vpblendd	$0xF0, YTMP2, Y_0, Y_0		# Y_0 = {W[3], W[2], W[1], W[0]}

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	add	3*8+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$28, a, T1	# T1 = (a >> 28)			# S0

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

	rotate_Ys
.endm

.macro DO_4ROUNDS

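# These rounds have no message scheduling interleaved. To shorten the
# critical path, the final two additions into h of rounds N+0..N+2 are
# deferred into the following round via old_h (an alias for the
# pre-rotation h); round N+3 completes h itself.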
################################### RND N + 0 #########################################

	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	RotateState

################################### RND N + 1 #########################################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	8*1+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	RotateState

################################### RND N + 2 #########################################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	8*2+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	RotateState

################################### RND N + 3 #########################################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	8*3+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

.endm

########################################################################
# void sha512_transform_rorx(struct sha512_state *state, const u8 *data, int blocks)
# Purpose: Updates the SHA512 digest stored at "state" with the message
# stored in "data".
# The size of the message pointed to by "data" must be an integer
# multiple of the SHA512 block size (128 bytes).
# "blocks" is the message length in SHA512 blocks
########################################################################
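# A minimal illustrative caller (hypothetical wrapper; the real glue
# code in sha512_ssse3_glue.c picks this routine at runtime when the
# CPU supports AVX2 and BMI2). SIMD code in the kernel must run between
# kernel_fpu_begin() and kernel_fpu_end():
#
#	static void sha512_update_blocks(struct sha512_state *st,
#					 const u8 *data, int blocks)
#	{
#		kernel_fpu_begin();
#		sha512_transform_rorx(st, data, blocks);
#		kernel_fpu_end();
#	}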
SYM_FUNC_START(sha512_transform_rorx)
	# Save GPRs
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# Allocate Stack Space
	push	%rbp
	mov	%rsp, %rbp
	sub	$frame_size, %rsp
	and	$~(0x20 - 1), %rsp
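	# %rbp preserves the original %rsp for the epilogue; the frame is
	# then aligned down to 32 bytes so the vmovdqa stores to
	# frame_XFER below land on a legal (aligned) boundary.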

	shl	$7, NUM_BLKS	# convert to bytes
	jz	done_hash
	add	INP, NUM_BLKS	# pointer to end of data
	mov	NUM_BLKS, frame_INPEND(%rsp)

	## load initial digest
	mov	8*0(CTX1), a
	mov	8*1(CTX1), b
	mov	8*2(CTX1), c
	mov	8*3(CTX1), d
	mov	8*4(CTX1), e
	mov	8*5(CTX1), f
	mov	8*6(CTX1), g
	mov	8*7(CTX1), h

	# save %rdi (CTX) before it gets clobbered
	mov	%rdi, frame_CTX(%rsp)

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK

loop0:
	lea	K512(%rip), TBL

	## byte swap first 16 qwords
	COPY_YMM_AND_BSWAP	Y_0, (INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_1, 1*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_2, 2*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_3, 3*32(INP), BYTE_FLIP_MASK

	mov	INP, frame_INP(%rsp)

	## schedule the remaining 64 message qwords, doing 4 iterations
	## of 16 rounds each
	movq	$4, frame_SRND(%rsp)

.align 16
loop1:
	vpaddq	(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	1*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	2*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	3*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	add	$(4*32), TBL
	FOUR_ROUNDS_AND_SCHED

	subq	$1, frame_SRND(%rsp)
	jne	loop1

	movq	$2, frame_SRND(%rsp)
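	# The last 16 rounds consume the already-scheduled words in
	# Y_0..Y_3 without computing new ones; the copies below slide
	# Y_2/Y_3 down so the second iteration reads them via Y_0/Y_1.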
loop2:
	vpaddq	(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	DO_4ROUNDS
	vpaddq	1*32(TBL), Y_1, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	add	$(2*32), TBL
	DO_4ROUNDS

	vmovdqa	Y_2, Y_0
	vmovdqa	Y_3, Y_1

	subq	$1, frame_SRND(%rsp)
	jne	loop2

	mov	frame_CTX(%rsp), CTX2
	addm	8*0(CTX2), a
	addm	8*1(CTX2), b
	addm	8*2(CTX2), c
	addm	8*3(CTX2), d
	addm	8*4(CTX2), e
	addm	8*5(CTX2), f
	addm	8*6(CTX2), g
	addm	8*7(CTX2), h

	mov	frame_INP(%rsp), INP
	add	$128, INP
	cmp	frame_INPEND(%rsp), INP
	jne	loop0

done_hash:

	# Restore Stack Pointer
	mov	%rbp, %rsp
	pop	%rbp

	# Restore GPRs
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx

	ret
SYM_FUNC_END(sha512_transform_rorx)

########################################################################
### Binary Data


# Mergeable 640-byte rodata section. This allows the linker to merge
# the table with an identical 640-byte fragment from another rodata
# section (if such a section exists).
.section	.rodata.cst640.K512, "aM", @progbits, 640
.align 64
# K[t] used in SHA512 hashing
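# (the first 64 bits of the fractional parts of the cube roots of the
# first eighty primes, per FIPS 180-4)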
K512:
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
# Mask for byte-swapping the qwords in a YMM register using vpshufb.
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x08090a0b0c0d0e0f0001020304050607
	.octa 0x18191a1b1c1d1e1f1011121314151617

.section	.rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
.align 32
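# Mask with the low 128 bits clear: vpand against it zeroes the two low
# qwords of a YMM register, keeping only the high lane (the {DC00} form
# used in the message schedule above).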
MASK_YMM_LO:
	.octa 0x00000000000000000000000000000000
	.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF