########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
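#
# The two message blocks are interleaved across the 128-bit lanes of the
# ymm registers: each X register holds four schedule dwords of the first
# block in its low lane and the corresponding dwords of the second block
# in its high lane, so one set of vector instructions computes the
# message schedule for both blocks.  The scalar rounds of the second
# block then reuse the K+W words saved on the stack at a +16 byte offset
# (see loop3 below).
########################################################################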

#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
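# For example, "addm (4*0)(CTX), a" adds the saved digest word state[0]
# into register a and stores the sum back to state[0]; this is how the
# working variables are folded into the digest after each block.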

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi  # 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e       = %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE
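
# Resulting stack frame (offsets from the 32-byte-aligned %rsp):
#	_XFER .. _XFER+511	K[t]+W[t] values for both interleaved blocks
#	_INP_END		pointer to the last 64-byte input block
#	_INP			current input pointer
#	_CTX			saved digest pointer (CTX is reused as SRND)
#	_RSP			unaligned %rsp, restored in the epilogue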

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
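# Rotating the assembler symbols instead of moving data lets each round
# body be written in terms of a..h while the values stay in place in
# their registers.  old_h names the register that held h before the
# rotation (it becomes the new a), so the final additions into h can be
# deferred into the following round (see DO_4ROUNDS).
#
# FOUR_ROUNDS_AND_SCHED below performs four rounds of the compression
# function while computing the next four message schedule words.  The
# scalar comments use the usual SHA-256 notation:
#	S0(a) = (a ror 2)  ^ (a ror 13) ^ (a ror 22)
#	S1(e) = (e ror 6)  ^ (e ror 11) ^ (e ror 25)
#	CH  = (e&f) ^ (~e&g), computed here as ((f^g)&e)^g
#	MAJ = (a&b) ^ (a&c) ^ (b&c), computed here as ((a|c)&b)|(a&c)
# while the interleaved vector code computes
#	s0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
#	s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
#	W[t]  = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])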

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA
	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	vpaddd	X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --

	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1,h		# h = k + w + h + S0                    # --
	add	y2,d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2,h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3,h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS
	rotate_Xs
.endm

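# DO_4ROUNDS performs four rounds without any message scheduling, using
# K[t]+W[t] values that are already sitting in the stack XFER area.  It
# is used for the final 16 rounds of the first block and, via the +16
# byte offsets in loop3, for all 64 rounds of the second interleaved
# block.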
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : number of 64-byte blocks to process
########################################################################
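# Note: NUM_BLKS, INP and CTX are consumed early because their registers
# are reused as e, y3 and SRND respectively; INP and CTX are therefore
# spilled to the stack frame (_INP, _CTX) before the round loops.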
.text
SYM_FUNC_START(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3
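	## X0 now holds message dwords 0..3 of block 1 in its low lane and
	## of block 2 in its high lane; X1..X3 hold dwords 4..7, 8..11 and
	## 12..15 the same way, pairing the two blocks lane by lane.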

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

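	## Each loop1 iteration runs 16 rounds (4 x FOUR_ROUNDS_AND_SCHED).
	## The vpaddd adds the round constants (duplicated across both
	## 128-bit lanes of K256) to the current schedule words, and the
	## sums are parked in the stack XFER area so loop3 can later replay
	## the second block's K+W values from the upper halves.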
.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	 _XFER + 0*32 + 16
	DO_4ROUNDS	 _XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

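# Only the low 128-bit lanes are loaded here (the VEX xmm loads clear
# the upper lanes), so the second-block half of the schedule is computed
# but never used: the _INP_END comparison above sends control to
# done_hash once this block's digest update is done.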
do_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
SYM_FUNC_END(sha256_transform_rorx)

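# Each group of four round constants appears twice in K256 so that a
# single 256-bit vpaddd applies K[t]..K[t+3] to both interleaved blocks
# at once.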
.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

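# byte-swap mask: reverses the bytes of every 32-bit word within each
# 128-bit lane, converting big-endian message words to host order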
.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
