/*
 *	Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record at the start of 'state', from 'input',
 * for 'blocks' consecutive 64-byte blocks (processed two at a time).
 *
 * extern "C" void sha1_transform_avx2(
 *	struct sha1_state *state, const u8* input, int blocks );
 */
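
/*
 * For reference, a minimal C sketch of a caller; this is illustrative only
 * (the real callers are the C glue wrappers), and the helper name below is
 * hypothetical.  Only the leading five u32 words of *state (20 bytes) are
 * read and written by this routine:
 *
 *	// hash two consecutive 64-byte blocks into state[0..4]
 *	static void hash_two_blocks(struct sha1_state *st, const u8 buf[128])
 *	{
 *		sha1_transform_avx2(st, buf, 2);
 *	}
 */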

#include <linux/linkage.h>

#define	CTX	%rdi	/* arg1 */
#define BUF	%rsi	/* arg2 */
#define CNT	%rdx	/* arg3 */

#define	REG_A	%ecx
#define	REG_B	%esi
#define	REG_C	%edi
#define	REG_D	%eax
#define	REG_E	%edx
#define	REG_TB	%ebx
#define	REG_TA	%r12d
#define	REG_RA	%rcx
#define	REG_RB	%rsi
#define	REG_RC	%rdi
#define	REG_RD	%rax
#define	REG_RE	%rdx
#define	REG_RTA	%r12
#define	REG_RTB	%rbx
#define	REG_T1	%r11d
#define	xmm_mov	vmovups
#define	avx2_zeroupper	vzeroupper
#define	RND_F1	1
#define	RND_F2	2
#define	RND_F3	3

.macro REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set TB, REG_TB
	.set TA, REG_TA

	.set RA, REG_RA
	.set RB, REG_RB
	.set RC, REG_RC
	.set RD, REG_RD
	.set RE, REG_RE

	.set RTA, REG_RTA
	.set RTB, REG_RTB

	.set T1, REG_T1
.endm

#define HASH_PTR	%r9
#define BLOCKS_CTR	%r8
#define BUFFER_PTR	%r10
#define BUFFER_PTR2	%r13

#define PRECALC_BUF	%r14
#define WK_BUF		%r15

#define W_TMP		%xmm0
#define WY_TMP		%ymm0
#define WY_TMP2		%ymm9

# AVX2 variables
#define WY0		%ymm3
#define WY4		%ymm5
#define WY08		%ymm7
#define WY12		%ymm8
#define WY16		%ymm12
#define WY20		%ymm13
#define WY24		%ymm14
#define WY28		%ymm15

#define YMM_SHUFB_BSWAP	%ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE		(80*2*2 +16)

#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
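
/*
 * A C sketch of the WK() addressing above, to make the stack layout
 * explicit (illustrative only, not used by the code).  Each 32-byte ymm
 * store covers one group of four rounds: the low 16 bytes hold W+K for the
 * first pipelined block, the high 16 bytes the same four rounds of the
 * second block:
 *
 *	static inline unsigned int wk_byte_offset(unsigned int t)
 *	{
 *		unsigned int round = t % 80;	// round within a block
 *		unsigned int block = t / 80;	// 0 = first, 1 = second block
 *
 *		return (round / 4) * 32		// 32-byte group per 4 rounds
 *			+ (round % 4) * 4	// dword within the group
 *			+ block * 16;		// low or high 16-byte lane
 *	}
 */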


.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

.macro PRECALC_RESET_WY
	.set WY_00, WY0
	.set WY_04, WY4
	.set WY_08, WY08
	.set WY_12, WY12
	.set WY_16, WY16
	.set WY_20, WY20
	.set WY_24, WY24
	.set WY_28, WY28
	.set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
	/* Rotate macros */
	.set WY_32, WY_28
	.set WY_28, WY_24
	.set WY_24, WY_20
	.set WY_20, WY_16
	.set WY_16, WY_12
	.set WY_12, WY_08
	.set WY_08, WY_04
	.set WY_04, WY_00
	.set WY_00, WY_32

	/* Define register aliases */
	.set WY, WY_00
	.set WY_minus_04, WY_04
	.set WY_minus_08, WY_08
	.set WY_minus_12, WY_12
	.set WY_minus_16, WY_16
	.set WY_minus_20, WY_20
	.set WY_minus_24, WY_24
	.set WY_minus_28, WY_28
	.set WY_minus_32, WY
.endm

.macro PRECALC_00_15
	.if (i == 0) # Initialize and rotate registers
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	/* message scheduling pre-compute for rounds 0-15 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vmovdqu (i * 2)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
			 WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
	.elseif ((i & 7) == 7)
		vmovdqu  WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
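
/*
 * In plain C, for one block (the asm does two blocks at once, one per
 * 16-byte ymm lane), the rounds 0-15 pre-compute above amounts to the
 * following sketch, with get_unaligned_be32() standing in for the vpshufb
 * byte swap:
 *
 *	for (t = 0; t < 16; t++) {
 *		w[t]  = get_unaligned_be32(block + 4 * t);
 *		wk[t] = w[t] + 0x5a827999;	// + K1, stored for the ALU adds
 *	}
 */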

.macro PRECALC_16_31
	/*
	 * message scheduling pre-compute for rounds 16-31
	 * calculating last 32 w[i] values in 8 XMM registers
	 * pre-calculate K+w[i] values and store to mem
	 * for later load by ALU add instruction
	 *
	 * "brute force" vectorization for rounds 16-31 only
	 * due to w[i]->w[i-3] dependency
	 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		/* w[i-14] */
		vpalignr	$8, WY_minus_16, WY_minus_12, WY
		vpsrldq	$4, WY_minus_04, WY_TMP               /* w[i-3] */
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY
		vpslldq	$12, WY, WY_TMP2
	.elseif ((i & 7) == 3)
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP
		vpslld	$2, WY_TMP2, WY
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		vpxor	WY_TMP2, WY_TMP, WY
		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
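
/*
 * Reference form of the schedule computed above, for one block in plain C
 * (the vector code produces four w[] values per step, which is why the
 * w[i-3] term needs the vpsrldq/vpslldq handling):
 *
 *	for (i = 16; i < 32; i++)
 *		w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *
 * with wk[i] = w[i] + K stored to memory for the later ALU adds.
 */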

.macro PRECALC_32_79
	/*
	 * in the SHA-1 specification:
	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
	 * instead we compute the equivalent:
	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	 * which allows more efficient vectorization,
	 * since the w[i]=>w[i-3] dependency is broken
	 */

	.if   ((i & 7) == 0)
	/*
	 * blended AVX2 and ALU instruction scheduling
	 * 1 vector iteration per 8 rounds
	 */
		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* W is W_minus_32 before xor */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY
	.elseif ((i & 7) == 7)
		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
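
/*
 * The identity relied on above, as a plain C self-check for one block
 * (illustrative only):
 *
 *	// standard schedule
 *	for (i = 16; i < 80; i++)
 *		w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *
 *	// dependency-broken form, valid for i >= 32
 *	for (i = 32; i < 80; i++)
 *		WARN_ON(w[i] != rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2));
 */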

.macro PRECALC r, s
	.set i, \r

	.if (i < 40)
		.set K_XMM, 32*0
	.elseif (i < 80)
		.set K_XMM, 32*1
	.elseif (i < 120)
		.set K_XMM, 32*2
	.else
		.set K_XMM, 32*3
	.endif

	.if (i<32)
		PRECALC_00_15	\s
	.elseif (i<64)
		PRECALC_16_31	\s
	.elseif (i < 160)
		PRECALC_32_79	\s
	.endif
.endm

.macro ROTATE_STATE
	.set T_REG, E
	.set E, D
	.set D, C
	.set C, B
	.set B, TB
	.set TB, A
	.set A, T_REG

	.set T_REG, RE
	.set RE, RD
	.set RD, RC
	.set RC, RB
	.set RB, RTB
	.set RTB, RA
	.set RA, T_REG
.endm

/*
 * The round macros rely on ROUND_FUNC being set by the RR macro below.
 * Note that each ROUND_Fx computes F for the *next* round (the F used in
 * the current round was produced one round earlier), which is why RR
 * switches ROUND_FUNC at round_id 18/38/58 rather than at 20/40/60.
 */

.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm

.macro RR r
	.set round_id, (\r % 80)

	.if (round_id == 0)        /* Precalculate F for first round */
		.set ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B    /* b>>>2 */
		andn	D, TB, T1
		and	C, TB
		xor	T1, TB
	.endif

	RND_FUN ROUND_FUNC, \r
	ROTATE_STATE

	.if   (round_id == 18)
		.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set ROUND_FUNC, RND_F2
	.endif

	.set round_id, ( (\r+1) % 80)

	RND_FUN ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm

.macro ROUND_F1 r
	add	WK(\r), E

	andn	C, A, T1			/* ~b&d */
	lea	(RE,RTB), E		/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30),A, TB		/* b>>>2 for next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	/*
	 * Calculate F for the next round
	 * (b & c) ^ andn[b, d]
	 */
	and	B, A			/* b&c */
	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

	lea	(RE,RTA), E		/* E += A >>> 5 */
.endm

.macro ROUND_F2 r
	add	WK(\r), E
	lea	(RE,RTB), E		/* Add F from the previous round */

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	/* b>>>2 for next round */
	.endif
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	.if ((round_id) < 79)
		xor	B, A
	.endif

	add	TA, E			/* E += A >>> 5 */

	.if ((round_id) < 79)
		xor	C, A
	.endif
.endm

.macro ROUND_F3 r
	add	WK(\r), E
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	lea	(RE,RTB), E		/* Add F from the previous round */

	mov	B, T1
	or	A, T1

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	/* Calculate F for the next round
	 * (b and c) or (d and (b or c))
	 */
	and	C, T1
	and	B, A
	or	T1, A

	add	TA, E			/* E += A >>> 5 */

.endm
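
/*
 * For reference, the logic the three round macros implement, as one plain
 * C round (the macros interleave two rounds and compute F one round ahead;
 * f1() is written in the andn form used above):
 *
 *	static u32 f1(u32 b, u32 c, u32 d) { return (b & c) ^ (~b & d); }
 *	static u32 f2(u32 b, u32 c, u32 d) { return b ^ c ^ d; }
 *	static u32 f3(u32 b, u32 c, u32 d) { return (b & c) | (d & (b | c)); }
 *
 *	// one round, t = 0..79; wk[t] = w[t] + K was precomputed above
 *	e += rol32(a, 5) + f(b, c, d) + wk[t];
 *	b  = rol32(b, 30);
 *	// then rotate: (a, b, c, d, e) = (e, a, b, c, d)
 *
 * f1 covers rounds 0-19, f2 rounds 20-39 and 60-79, f3 rounds 40-59.
 */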

/*
 * Add a constant to \a only if \b >= \c (uses RTA as a temporary):
 *	\a = (\b >= \c) ? \a + \d : \a
 */
.macro ADD_IF_GE a, b, c, d
	mov     \a, RTA
	add     $\d, RTA
	cmp     $\c, \b
	cmovge  RTA, \a
.endm
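
/*
 * In C terms, a sketch of the macro above: the pointer is only advanced
 * while enough blocks remain, so the speculative pre-calculation for the
 * next blocks keeps reading valid input:
 *
 *	if (blocks_ctr >= c)
 *		a += d;			// e.g. BUFFER_PTR2 += 64
 */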

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
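
/*
 * Shape of that pipelining, sketched in C-like pseudocode (illustrative
 * only; the details are in the macro below):
 *
 *	precalc_wk(wk_buf, block[0], block[1]);	// W+K for the first pair
 *	while (blocks_ctr) {
 *		// the 80 ALU rounds per block read wk_buf, while the
 *		// interleaved vector code fills precalc_buf for the next pair
 *		do_80_rounds(wk_buf, 0);  update_hash();  blocks_ctr--;
 *		if (!blocks_ctr)
 *			break;
 *		do_80_rounds(wk_buf, 1);  update_hash();  blocks_ctr--;
 *		swap(wk_buf, precalc_buf);		// the xchg below
 *	}
 */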
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	# Precalc WK for first 2 blocks
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
	.set i, 0
	.rept    160
		PRECALC i
		.set i, i + 1
	.endr

	/* Go to next block if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
	xchg	WK_BUF, PRECALC_BUF

	.align 32
.L_loop:
	/*
	 * the code loops through more than one block;
	 * BLOCKS_CTR, decremented once per processed block below, signals
	 * when the last block is done
	 */
	test BLOCKS_CTR, BLOCKS_CTR
	jnz .L_begin
	.align 32
	jmp	.L_end
	.align 32
.L_begin:

	/*
	 * Do first block
	 * rounds: 0,2,4,6,8
	 */
	.set j, 0
	.rept 5
		RR	j
		.set j, j+2
	.endr

	/*
	 * rounds:
	 * 10,12,14,16,18
	 * 20,22,24,26,28
	 * 30,32,34,36,38
	 * 40,42,44,46,48
	 * 50,52,54,56,58
	 */
	.rept 25
		RR	j
		.set j, j+2
	.endr

	/* Update Counter */
	sub $1, BLOCKS_CTR
	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
	/*
	 * rounds
	 * 60,62,64,66,68
	 * 70,72,74,76,78
	 */
	.rept 10
		RR	j
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	test	BLOCKS_CTR, BLOCKS_CTR
	jz	.L_loop

	mov	TB, B

	/* Process second block */
	/*
	 * rounds
	 *  0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */

	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/* update counter */
	sub     $1, BLOCKS_CTR
	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Reset state for AVX2 reg permutation */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	xchg	WK_BUF, PRECALC_BUF

	jmp	.L_loop

	.align 32
.L_end:

.endm
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
	SYM_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	RESERVE_STACK  = (W_SIZE*4 + 8+24)

	/* Align stack */
	push	%rbp
	mov	%rsp, %rbp
	and	$~(0x20-1), %rsp
	sub	$RESERVE_STACK, %rsp

	avx2_zeroupper

	/* Setup initial values */
	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	mov	BUF, BUFFER_PTR2
	mov	CNT, BLOCKS_CTR

	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	mov	%rbp, %rsp
	pop	%rbp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx

	RET

	SYM_FUNC_END(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
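
/*
 * BSWAP_SHUFB_CTL is the vpshufb control mask that reverses the bytes
 * within each 32-bit lane, i.e. it converts the little-endian loads into
 * the big-endian message words SHA-1 operates on.  Per dword this is
 * simply (a C sketch):
 *
 *	w = get_unaligned_be32(p);	// equivalent to swab32 of the LE load
 */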
.text

SHA1_VECTOR_ASM     sha1_transform_avx2