# xref: /openbmc/linux/arch/x86/crypto/sha512-avx-asm.S (revision 1ac731c529cd4d6adbce134754b51ff7d822b145)
########################################################################
# Implement fast SHA-512 with AVX instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     David Cote <david.m.cote@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-512 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.text

# Virtual Registers
#
# Assembler symbols that give the general-purpose registers the names used
# throughout this file.
#
# ARG1: base address of the eight 64-bit digest words (see DIGEST(i))
digest	= %rdi
# ARG2: address of the current message block (see MSG(i))
msg	= %rsi
# ARG3: number of message blocks still to be processed
msglen	= %rdx
# Scratch registers used inside each round
T1	= %rcx
T2	= %r8
# Working variables a..h.  These are symbols, not fixed registers: the
# RotateState macro rebinds them after every round.
a_64	= %r9
b_64	= %r10
c_64	= %r11
d_64	= %r12
e_64	= %r13
f_64	= %r14
g_64	= %r15
h_64	= %rbx
# Scratch register for the rotate/xor chains
tmp0	= %rax

# Local variables (stack frame)

# Message Schedule: 80 qwords, W[0]..W[79]
W_SIZE = 80*8
# W[t] + K[t] | W[t+1] + K[t+1]
# (the two qwords that the next two rounds consume)
WK_SIZE = 2*8

# Byte offsets of the two areas from %rsp, and the total frame size
frame_W = 0
frame_WK = frame_W + W_SIZE
frame_size = frame_WK + WK_SIZE

# Useful QWORD "arrays" for simpler memory references
# MSG, DIGEST, K_t, W_t are arrays
# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
#
# The argument is pasted into the operand text without parentheses, so pass
# a single symbol or number rather than an expression.

# Input message (arg2)
#define MSG(i)    8*i(msg)

# Output Digest (arg1)
#define DIGEST(i) 8*i(digest)

# SHA Constants (static mem)
#define K_t(i)    8*i+K512(%rip)

# Message Schedule (stack frame)
#define W_t(i)    8*i+frame_W(%rsp)

# W[t]+K[t] (stack frame)
#define WK_2(i)   8*((i%2))+frame_WK(%rsp)

.macro RotateState
	# Rotate symbols a..h right, so that the next round finds each
	# working variable under its new name (old h becomes a, old a
	# becomes b, and so on).  Only assembler symbols are rebound here;
	# no instruction is emitted and no register content moves.
	TMP   = h_64
	h_64  = g_64
	g_64  = f_64
	f_64  = e_64
	e_64  = d_64
	d_64  = c_64
	c_64  = b_64
	b_64  = a_64
	a_64  = TMP
.endm

.macro RORQ p1 p2
	# Rotate the 64-bit register p1 right by p2 bits.
	# A double-precision left shift of the register through itself by
	# 64-p2 bits yields the same result as a right rotate by p2.
	# shld is faster than ror on Sandybridge
	shld	$(64-\p2), \p1, \p1
.endm

.macro SHA512_Round rnd
	# Compute one SHA-512 round on the working variables a..h.
	#
	# rnd: round number.  Only its parity matters here: it selects which
	#      of the two WK_2 slots holds W[t] + K[t] for this round.
	#
	# Clobbers T1, T2 and tmp0.  The new e is accumulated in the register
	# currently named d and the new a in the register currently named h;
	# RotateState then renames the symbols accordingly.
	#
	# The bare number after each RORQ is the total right-rotation that the
	# copy of e (or a) most recently folded into tmp0 will have received
	# once the chain is complete: 41/18/14 for S1, 39/34/28 for S0.
	mov     f_64, T1          # T1 = f
	mov     e_64, tmp0        # tmp = e
	xor     g_64, T1          # T1 = f ^ g
	RORQ    tmp0, 23   # 41    # tmp = e ror 23
	and     e_64, T1          # T1 = (f ^ g) & e
	xor     e_64, tmp0        # tmp = (e ror 23) ^ e
	xor     g_64, T1          # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
	idx = \rnd
	add     WK_2(idx), T1     # W[t] + K[t] from message scheduler
	RORQ    tmp0, 4   # 18    # tmp = ((e ror 23) ^ e) ror 4
	xor     e_64, tmp0        # tmp = (((e ror 23) ^ e) ror 4) ^ e
	mov     a_64, T2          # T2 = a
	add     h_64, T1          # T1 = CH(e,f,g) + W[t] + K[t] + h
	RORQ    tmp0, 14  # 14    # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
	add     tmp0, T1          # T1 = CH(e,f,g) + W[t] + K[t] + h + S1(e)
	mov     a_64, tmp0        # tmp = a
	xor     c_64, T2          # T2 = a ^ c
	and     c_64, tmp0        # tmp = a & c
	and     b_64, T2          # T2 = (a ^ c) & b
	xor     tmp0, T2          # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
	mov     a_64, tmp0        # tmp = a
	RORQ    tmp0, 5  # 39     # tmp = a ror 5
	xor     a_64, tmp0        # tmp = (a ror 5) ^ a
	add     T1, d_64          # e(next_state) = d + T1
	RORQ    tmp0, 6  # 34     # tmp = ((a ror 5) ^ a) ror 6
	xor     a_64, tmp0        # tmp = (((a ror 5) ^ a) ror 6) ^ a
	lea     (T1, T2), h_64    # a(next_state) = T1 + Maj(a,b,c)
	RORQ    tmp0, 28  # 28    # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
	add     tmp0, h_64        # a(next_state) = T1 + Maj(a,b,c) + S0(a)
	RotateState
.endm

.macro SHA512_2Sched_2Round_avx rnd
	# Compute rounds t-2 and t-1
	# Compute message schedule QWORDS t and t+1
	#
	# rnd: index t of the first of the two schedule qwords to produce.
	# Clobbers T1, T2, tmp0 and xmm0-xmm9; renames a..h twice.

	#   Two rounds are computed based on the values for K[t-2]+W[t-2] and
	# K[t-1]+W[t-1] which were previously stored at WK_2 by the message
	# scheduler.
	#   The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
	# They are then added to their respective SHA512 constants at
	# [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
	#   For brevity, the comments following vectored instructions only refer to
	# the first of a pair of QWORDS.
	# Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
	#   The computation of the message schedule and the rounds are tightly
	# stitched to take advantage of instruction-level parallelism.
	#   The scalar instructions are the same sequence as in SHA512_Round,
	# issued twice; see that macro for the per-instruction derivation.
	#   The vector store to WK_2 near the end comes after both scalar reads
	# of WK_2, so the old W+K pair is consumed before it is overwritten.

	idx = \rnd - 2
	vmovdqa	W_t(idx), %xmm4		# XMM4 = W[t-2]
	idx = \rnd - 15
	vmovdqu	W_t(idx), %xmm5		# XMM5 = W[t-15]
	mov	f_64, T1		# first round (t-2) starts: T1 = f
	vpsrlq	$61, %xmm4, %xmm0	# XMM0 = W[t-2]>>61
	mov	e_64, tmp0		# tmp = e
	vpsrlq	$1, %xmm5, %xmm6	# XMM6 = W[t-15]>>1
	xor	g_64, T1		# T1 = f ^ g
	RORQ	tmp0, 23 # 41
	vpsrlq	$19, %xmm4, %xmm1	# XMM1 = W[t-2]>>19
	and	e_64, T1		# T1 = (f ^ g) & e
	xor	e_64, tmp0
	vpxor	%xmm1, %xmm0, %xmm0	# XMM0 = W[t-2]>>61 ^ W[t-2]>>19
	xor	g_64, T1		# T1 = CH(e,f,g)
	idx = \rnd
	add	WK_2(idx), T1		# T1 += W[t-2] + K[t-2] (same-parity slot)
	vpsrlq	$8, %xmm5, %xmm7	# XMM7 = W[t-15]>>8
	RORQ	tmp0, 4 # 18
	vpsrlq	$6, %xmm4, %xmm2	# XMM2 = W[t-2]>>6
	xor	e_64, tmp0
	mov	a_64, T2		# T2 = a
	add	h_64, T1		# T1 += h
	vpxor	%xmm7, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8
	RORQ	tmp0, 14 # 14		# tmp = S1(e)
	add	tmp0, T1		# T1 += S1(e)
	vpsrlq	$7, %xmm5, %xmm8	# XMM8 = W[t-15]>>7
	mov	a_64, tmp0
	xor	c_64, T2
	vpsllq	$(64-61), %xmm4, %xmm3  # XMM3 = W[t-2]<<3
	and	c_64, tmp0
	and	b_64, T2
	vpxor	%xmm3, %xmm2, %xmm2	# XMM2 = W[t-2]>>6 ^ W[t-2]<<3
	xor	tmp0, T2		# T2 = Maj(a,b,c)
	mov	a_64, tmp0
	vpsllq	$(64-1), %xmm5, %xmm9	# XMM9 = W[t-15]<<63
	RORQ	tmp0, 5 # 39
	vpxor	%xmm9, %xmm8, %xmm8	# XMM8 = W[t-15]>>7 ^ W[t-15]<<63
	xor	a_64, tmp0
	add	T1, d_64		# e(next_state) = d + T1
	RORQ	tmp0, 6 # 34
	xor	a_64, tmp0
	vpxor	%xmm8, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
					#  W[t-15]>>7 ^ W[t-15]<<63
	lea	(T1, T2), h_64		# a(next_state) = T1 + Maj(a,b,c)
	RORQ	tmp0, 28 # 28		# tmp = S0(a)
	vpsllq	$(64-19), %xmm4, %xmm4  # XMM4 = W[t-2]<<45
	add	tmp0, h_64		# a(next_state) += S0(a)
	RotateState
	vpxor	%xmm4, %xmm0, %xmm0     # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
					#        W[t-2]<<45
	mov	f_64, T1		# second round (t-1) starts: T1 = f
	vpxor	%xmm2, %xmm0, %xmm0     # XMM0 = s1(W[t-2])
	mov	e_64, tmp0
	xor	g_64, T1
	idx = \rnd - 16
	vpaddq	W_t(idx), %xmm0, %xmm0  # XMM0 = s1(W[t-2]) + W[t-16]
	idx = \rnd - 7
	vmovdqu	W_t(idx), %xmm1		# XMM1 = W[t-7]
	RORQ	tmp0, 23 # 41
	and	e_64, T1
	xor	e_64, tmp0
	xor	g_64, T1		# T1 = CH(e,f,g)
	vpsllq	$(64-8), %xmm5, %xmm5   # XMM5 = W[t-15]<<56
	idx = \rnd + 1
	add	WK_2(idx), T1		# T1 += W[t-1] + K[t-1] (same-parity slot)
	vpxor	%xmm5, %xmm6, %xmm6     # XMM6 = s0(W[t-15])
	RORQ	tmp0, 4 # 18
	vpaddq	%xmm6, %xmm0, %xmm0     # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
	xor	e_64, tmp0
	vpaddq	%xmm1, %xmm0, %xmm0     # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
					#               s0(W[t-15]) + W[t-16]
	mov	a_64, T2
	add	h_64, T1		# T1 += h
	RORQ	tmp0, 14 # 14		# tmp = S1(e)
	add	tmp0, T1		# T1 += S1(e)
	idx = \rnd
	vmovdqa	%xmm0, W_t(idx)		# Store W[t]
	vpaddq	K_t(idx), %xmm0, %xmm0  # Compute W[t]+K[t]
	vmovdqa	%xmm0, WK_2(idx)	# Store W[t]+K[t] for next rounds
	mov	a_64, tmp0
	xor	c_64, T2
	and	c_64, tmp0
	and	b_64, T2
	xor	tmp0, T2		# T2 = Maj(a,b,c)
	mov	a_64, tmp0
	RORQ	tmp0, 5 # 39
	xor	a_64, tmp0
	add	T1, d_64		# e(next_state) = d + T1
	RORQ	tmp0, 6 # 34
	xor	a_64, tmp0
	lea	(T1, T2), h_64		# a(next_state) = T1 + Maj(a,b,c)
	RORQ	tmp0, 28 # 28		# tmp = S0(a)
	add	tmp0, h_64		# a(next_state) += S0(a)
	RotateState
.endm

########################################################################
# void sha512_transform_avx(sha512_state *state, const u8 *data, int blocks)
# Purpose: Updates the SHA512 digest stored at "state" with the message
# stored in "data".
# The size of the message pointed to by "data" must be an integer multiple
# of SHA512 message blocks.
# "blocks" is the message length in SHA512 blocks
#
# In:       %rdi = state, %rsi = data, %rdx = blocks
# Out:      the eight qwords at "state" are updated in place
# Saved:    %rbx, %r12-%r15 and %rbp are pushed on entry, popped on exit
# Clobbers: %rax, %rcx, %rsi, %rdx, %r8-%r11, %xmm0-%xmm9, flags
# Stack:    frame_size bytes, with %rsp rounded down to a 32-byte boundary
# Returns at once, before touching the stack, when "blocks" is zero.
#
# NOTE(review): the prototype declares "blocks" as int, but the count is
# tested and decremented through the full 64-bit %rdx; confirm that callers
# never leave stray bits in the upper half.
########################################################################
SYM_TYPED_FUNC_START(sha512_transform_avx)
	test msglen, msglen
	je .Lnowork

	# Save GPRs
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# Allocate Stack Space
	# %rbp keeps the unaligned %rsp so the epilogue can restore it
	push	%rbp
	mov	%rsp, %rbp
	sub     $frame_size, %rsp
	and	$~(0x20 - 1), %rsp

.Lupdateblock:

	# Load state variables
	mov     DIGEST(0), a_64
	mov     DIGEST(1), b_64
	mov     DIGEST(2), c_64
	mov     DIGEST(3), d_64
	mov     DIGEST(4), e_64
	mov     DIGEST(5), f_64
	mov     DIGEST(6), g_64
	mov     DIGEST(7), h_64

	# Fully unrolled at assembly time: t takes the values 0, 2, ..., 80.
	t = 0
	.rept 80/2 + 1
	# (80 rounds) / (2 rounds/iteration) + (1 iteration)
	# +1 iteration because the scheduler leads hashing by 1 iteration
		.if t < 2
			# BSWAP 2 QWORDS
			# The shuffle mask stays in xmm1 for the t < 16
			# iterations below; SHA512_Round uses no vector
			# registers, so it is not disturbed.
			vmovdqa  XMM_QWORD_BSWAP(%rip), %xmm1
			vmovdqu  MSG(t), %xmm0
			vpshufb  %xmm1, %xmm0, %xmm0    # BSWAP
			vmovdqa  %xmm0, W_t(t) # Store Scheduled Pair
			vpaddq   K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
			vmovdqa  %xmm0, WK_2(t) # Store into WK for rounds
		.elseif t < 16
			# BSWAP 2 QWORDS; Compute 2 Rounds
			# WK is only overwritten after both rounds have
			# read their W+K value from it.
			vmovdqu  MSG(t), %xmm0
			vpshufb  %xmm1, %xmm0, %xmm0    # BSWAP
			SHA512_Round t-2    # Round t-2
			vmovdqa  %xmm0, W_t(t) # Store Scheduled Pair
			vpaddq   K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
			SHA512_Round t-1    # Round t-1
			vmovdqa  %xmm0, WK_2(t)# Store W[t]+K[t] into WK
		.elseif t < 79
			# Schedule 2 QWORDS; Compute 2 Rounds
			SHA512_2Sched_2Round_avx t
		.else
			# Compute 2 Rounds
			# (t = 80: nothing left to schedule, only the last
			# two rounds remain)
			SHA512_Round t-2
			SHA512_Round t-1
		.endif
		t = t+2
	.endr

	# Update digest
	add     a_64, DIGEST(0)
	add     b_64, DIGEST(1)
	add     c_64, DIGEST(2)
	add     d_64, DIGEST(3)
	add     e_64, DIGEST(4)
	add     f_64, DIGEST(5)
	add     g_64, DIGEST(6)
	add     h_64, DIGEST(7)

	# Advance to next message block
	# (16 qwords = 128 bytes per block)
	add     $16*8, msg
	dec     msglen
	jnz     .Lupdateblock

	# Restore Stack Pointer
	mov	%rbp, %rsp
	pop	%rbp

	# Restore GPRs
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx

.Lnowork:
	RET
SYM_FUNC_END(sha512_transform_avx)

########################################################################
### Binary Data

.section	.rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
.align 16
# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
# Each of the two qwords has its eight bytes reversed in place; the qwords
# themselves keep their positions.
XMM_QWORD_BSWAP:
	.octa 0x08090a0b0c0d0e0f0001020304050607

# Mergeable 640-byte rodata section. This allows linker to merge the table
# with other, exactly the same 640-byte fragment of another rodata section
# (if such section exists).
.section	.rodata.cst640.K512, "aM", @progbits, 640
.align 64
# K[t] used in SHA512 hashing
# 80 qwords, two per line, in round order; read through K_t(i).
K512:
	.quad 0x428a2f98d728ae22,0x7137449123ef65cd
	.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad 0x3956c25bf348b538,0x59f111f1b605d019
	.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad 0xd807aa98a3030242,0x12835b0145706fbe
	.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad 0x9bdc06a725c71235,0xc19bf174cf692694
	.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad 0x983e5152ee66dfab,0xa831c66d2db43210
	.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad 0x06ca6351e003826f,0x142929670a0e6e70
	.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad 0x81c2c92e47edaee6,0x92722c851482353b
	.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad 0xd192e819d6ef5218,0xd69906245565a910
	.quad 0xf40e35855771202a,0x106aa07032bbd1b8
	.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad 0x90befffa23631e28,0xa4506cebde82bde9
	.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad 0xca273eceea26619c,0xd186b8c721c0c207
	.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad 0x113f9804bef90dae,0x1b710b35131c471b
	.quad 0x28db77f523047d84,0x32caab7b40c72493
	.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817