1########################################################################
2# Implement fast SHA-256 with AVX2 instructions. (x86_64)
3#
4# Copyright (C) 2013 Intel Corporation.
5#
6# Authors:
7#     James Guilford <james.guilford@intel.com>
8#     Kirk Yap <kirk.s.yap@intel.com>
9#     Tim Chen <tim.c.chen@linux.intel.com>
10#
11# This software is available to you under a choice of one of two
12# licenses.  You may choose to be licensed under the terms of the GNU
13# General Public License (GPL) Version 2, available from the file
14# COPYING in the main directory of this source tree, or the
15# OpenIB.org BSD license below:
16#
17#     Redistribution and use in source and binary forms, with or
18#     without modification, are permitted provided that the following
19#     conditions are met:
20#
21#      - Redistributions of source code must retain the above
22#        copyright notice, this list of conditions and the following
23#        disclaimer.
24#
25#      - Redistributions in binary form must reproduce the above
26#        copyright notice, this list of conditions and the following
27#        disclaimer in the documentation and/or other materials
28#        provided with the distribution.
29#
30# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
31# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
33# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
34# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
35# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
36# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37# SOFTWARE.
38#
39########################################################################
40#
41# This code is described in an Intel White-Paper:
42# "Fast SHA-256 Implementations on Intel Architecture Processors"
43#
44# To find it, surf to http://www.intel.com/p/en_US/embedded
45# and search for that title.
46#
47########################################################################
48# This code schedules 2 blocks at a time, with 4 lanes per block
49########################################################################
50
51#include <linux/linkage.h>
52#include <linux/cfi_types.h>
53
## assume buffers not aligned
## All input-data loads go through VMOVDQ (unaligned form); stores to the
## 32-byte-aligned stack XFER area use vmovdqa directly.
#define	VMOVDQ vmovdqu
56
57################################ Define Macros
58
59# addm [mem], reg
60# Add reg to mem using reg-mem add and store
# addm mem, reg
# reg += [mem]; [mem] = reg  (accumulate register into memory word)
.macro addm mem reg
	add	\mem, \reg
	mov	\reg, \mem
.endm
65
66################################
67
# Message-schedule state: 2 blocks interleaved, 4 dwords per block per ymm
# (see "schedules 2 blocks at a time, with 4 lanes per block" above).
X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above (low lanes; used for the single-block tail path)
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

# Scratch vectors for the message-schedule computation
XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi  # 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e       = %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX (CTX is saved in _CTX first)

# Working variables a..h of the SHA-256 round function.  These are
# symbolic names that ROTATE_ARGS renames instead of moving data.
a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d	# aliases h; after ROTATE_ARGS it names the previous
		# round's h so DO_4ROUNDS can finish that update late

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


# Stack frame layout (offsets from the 32-byte-aligned %rsp):
#   _XFER    : precomputed W+K words for 64 rounds x 2 blocks
#   _INP_END : pointer to the last 64-byte input block
#   _INP     : current input pointer
#   _CTX     : saved state pointer (needed because CTX doubles as SRND)
_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
STACK_SIZE	= _CTX      + _CTX_SIZE
128
# rotate_Xs
# Rotate values of symbols X0...X3 one step (X0<-X1, X1<-X2, X2<-X3,
# X3<-old X0).  Pure assembler-symbol renaming — no data moves.  Invoked
# at the end of FOUR_ROUNDS_AND_SCHED, so the freshly scheduled vector
# (written into X0 there) becomes X3, the newest window.
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm
138
# ROTATE_ARGS
# Rotate values of symbols a...h as one SHA-256 round does
# (h<-g, g<-f, ..., b<-a, a<-old h).  Again pure symbol renaming: the
# round code never moves the state registers.  old_h is pointed at the
# pre-rotation h so the *next* round can complete this round's h update
# late (see DO_4ROUNDS).
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
153
# FOUR_ROUNDS_AND_SCHED disp
# Perform 4 SHA-256 rounds, consuming the precomputed W+K words at
# \disp(%rsp, SRND), while simultaneously computing the next 4
# message-schedule words (s0/s1 mixing) for both interleaved blocks
# with AVX2.  Scalar round instructions and vector schedule
# instructions are hand-interleaved for throughput; do not reorder.
# Ends with ROTATE_ARGS / rotate_Xs, which rename (not move) the
# working variables and schedule vectors.
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA
	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	vpaddd	X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --

	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a ,T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1,h		# h = k + w + h + S0                    # --
	add	y2,d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2,h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3,h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS
	rotate_Xs
.endm
355
# DO_4ROUNDS disp
# Perform 4 SHA-256 rounds using the precomputed W+K words at
# \disp(%rsp, SRND); no message scheduling.  To shorten the critical
# path, each round defers its final two h additions into the start of
# the following round, applied to old_h (the pre-ROTATE_ARGS name of
# that register); the last round completes h in-line.
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS

.endm
519
########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
##
## Processes blocks two at a time (message schedule for both blocks is
## computed together; rounds for the second block reuse the scheduled
## W+K values saved on the stack).  An odd trailing block goes through
## the single-block path (do_last_block).
## Clobbers: the argument registers, rcx/r8/r10/r11 and ymm0-ymm13;
## rbx/rbp/r12-r15 are saved and restored.
########################################################################
.text
SYM_TYPED_FUNC_START(sha256_transform_rorx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	push	%rbp
	mov	%rsp, %rbp	# rbp preserves the pre-alignment rsp

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary

	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)	# CTX register is reused as SRND below

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 12 each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	## add this block's working variables into the digest
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	 _XFER + 0*32 + 16
	DO_4ROUNDS	 _XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0		# >= 2 blocks remain: two-block path again
	ja	done_hash	# none remain; equal means exactly one left

do_last_block:
	## single (last) block: load 16 dwords into the low xmm lanes
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	%rbp, %rsp
	pop	%rbp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx

	# ymm registers were used above: clear the upper YMM state before
	# returning to avoid AVX->SSE transition penalties in the caller.
	vzeroupper
	RET
SYM_FUNC_END(sha256_transform_rorx)
715
.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
# SHA-256 round constants K[0..63].  Each group of four constants is
# stored twice (low and high 128-bit lane) so a single 256-bit vpaddd
# adds the same K values to the schedule words of both interleaved
# blocks.
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
751
.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
# vpshufb mask that byte-swaps each 32-bit word of the input message
# (used in the "byte swap data" step; same pattern in both lanes).
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
756
# shuffle xBxA -> 00BA
# vpshufb mask: keeps dwords A and B in the low half, zeroes the rest
# (0xFF bytes make vpshufb write zero).
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
762
# shuffle xDxC -> DC00
# vpshufb mask: moves dwords C and D into the high half, zeroes the low
# half (complement of _SHUF_00BA).
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
768