########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
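#
# Illustratively (a C sketch, not part of the build), "addm (4*i)(CTX), reg"
# behaves as:
#
#	reg      += state[i];	/* add  mem, reg */
#	state[i]  = reg;	/* mov  reg, mem */
#
# i.e. the working variable is folded back into the digest word in memory,
# with the sum also left in the register.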

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi  # 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e       = %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
STACK_SIZE	= _CTX      + _CTX_SIZE
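
# For reference, with the sizes above the frame layout works out to:
#	_XFER      = 0		(2 * 64 * 4 = 512 bytes of saved K+W values)
#	_XMM_SAVE  = 512	(empty)
#	_INP_END   = 512
#	_INP       = 520
#	_CTX       = 528
#	STACK_SIZE = 536	(%rsp is then aligned down to 32 bytes in the
#				 prologue, since the vmovdqa stores to _XFER
#				 require 32-byte alignment)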

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm

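# ROTATE_ARGS renames the a..h symbols rather than moving data: the register
# that held h before a round becomes a after it, so the usual end-of-round
# shuffle costs no instructions.
#
# For reference, each SHA-256 round (FIPS 180-4) computes, in C-like form:
#
#	S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
#	ch  = ((f ^ g) & e) ^ g;		/* == (e & f) ^ (~e & g) */
#	t1  = h + S1 + ch + K[t] + W[t];
#	S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
#	maj = ((a | c) & b) | (a & c);		/* == majority of a, b, c */
#	d += t1;
#	h  = t1 + S0 + maj;
#	/* then rotate a..h one position (ROTATE_ARGS) */
#
# The macro below interleaves these scalar steps (using the BMI2 rorx
# instruction) with the AVX2 message-schedule computation; K[t] + W[t] is
# picked up pre-added from the _XFER area on the stack.
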
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA
	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --

	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS
	rotate_Xs
.endm
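
# For reference, the vector half of FOUR_ROUNDS_AND_SCHED computes the
# message schedule (per 128-bit lane, i.e. independently for each of the
# two interleaved blocks), in C-like form:
#
#	s0   = ror32(w[t-15], 7) ^ ror32(w[t-15], 18) ^ (w[t-15] >> 3);
#	s1   = ror32(w[t-2], 17) ^ ror32(w[t-2], 19) ^ (w[t-2] >> 10);
#	w[t] = w[t-16] + s0 + w[t-7] + s1;
#
# One invocation produces four new w[t] per lane.  Because w[t-2] for the
# last two of those is only produced within the same step, s1 is computed
# in two halves ({BBAA} then {DDCC}) and merged via SHUF_00BA/SHUF_DC00.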

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS

.endm
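
# DO_4ROUNDS performs the same scalar rounds as FOUR_ROUNDS_AND_SCHED but
# without any message-schedule work.  It is used for the final 16 rounds of
# the first block and for all 64 rounds of the second, interleaved block,
# which reuses the K+W values already saved in the _XFER area (the "+ 16"
# displacements in .Lloop3 select the upper 128-bit lane of each saved
# 256-bit XFER row).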

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : number of 64-byte blocks to process
########################################################################
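##
## An illustrative C-side view (hypothetical glue-code sketch, not taken
## from this file): the routine is declared asmlinkage and may only run
## while the FPU/SIMD state is claimed.
##
##	asmlinkage void sha256_transform_rorx(struct sha256_state *state,
##					      const u8 *data, int blocks);
##
##	kernel_fpu_begin();
##	sha256_transform_rorx(state, data, blocks);  /* blocks * 64 bytes */
##	kernel_fpu_end();
##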
.text
SYM_TYPED_FUNC_START(sha256_transform_rorx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	push	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary

	shl	$6, NUM_BLKS	# convert to bytes
	jz	.Ldone_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	.Lonly_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

.Lloop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3
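
	## After the transpose, each ymm holds the same four message-word
	## positions from both blocks: block 1 in the low 128-bit lane and
	## block 2 in the high lane, e.g. X0 = { blk2 W3..W0 | blk1 W3..W0 }.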

.Llast_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
.Lloop1:
	leaq	K256+0*32(%rip), INP		## reuse INP as scratch reg
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 0*32)

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 1*32)

	leaq	K256+2*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 2*32)

	leaq	K256+3*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 3*32)

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	.Lloop1

.Lloop2:
	## Do last 16 rounds with no scheduling
	leaq	K256+0*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	(_XFER + 0*32)

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	(_XFER + 1*32)
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	.Lloop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	.Ldone_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
.Lloop3:
	DO_4ROUNDS	(_XFER + 0*32 + 16)
	DO_4ROUNDS	(_XFER + 1*32 + 16)
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	.Lloop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

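	## Three-way exit: INP below _INP_END means at least two more blocks
	## remain (loop back and load another pair), above means all blocks
	## are done, equal means exactly one 64-byte block is left, handled
	## by falling through to .Ldo_last_block.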
	cmp	_INP_END(%rsp), INP
	jb	.Lloop0
	ja	.Ldone_hash

.Ldo_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	.Llast_block_enter

.Lonly_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	.Ldo_last_block

.Ldone_hash:

	mov	%rbp, %rsp
	pop	%rbp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	vzeroupper
	RET
SYM_FUNC_END(sha256_transform_rorx)

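# The round constants below are laid out with each group of four K values
# duplicated in both 128-bit halves of a 32-byte row, so a single vpaddd
# adds the same K[t] to the block-1 (low lane) and block-2 (high lane)
# schedule words.
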
.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
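# Applied with vpshufb, the mask above reverses the bytes within each
# 32-bit word, converting the big-endian message words to the little-endian
# order expected by the round arithmetic.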

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF