/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 (SSSE3) instruction set extensions.
 *
 * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorith…
 */
/* we keep a window of 64 pre-calculated w[i]+K values in a circular buffer */
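For orientation, the circular window can be modeled in C as below; the 16-entry size and the wk_store/wk_load names are assumptions of this sketch, not taken from the .S file.

	#include <stdint.h>

	/* Illustrative model of the circular w[i]+K window (names and the
	 * 16-entry size are assumptions of this sketch, not kernel API). */
	static uint32_t wk_window[16];

	static inline void wk_store(int t, uint32_t w_plus_k)
	{
		wk_window[t & 15] = w_plus_k;	/* slot is reused every 16 rounds */
	}

	static inline uint32_t wk_load(int t)
	{
		return wk_window[t & 15];	/* consumed by the scalar round adds */
	}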
/*
 * This macro implements the SHA-1 function's body for a single 64-byte block
 */

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
	mov	12(HASH_PTR), D

	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78
	UPDATE_HASH 12(HASH_PTR), D
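Taken together, the 80 RR rounds plus the UPDATE_HASH feed-forward are the textbook SHA-1 compression function. Below is a minimal scalar C rendering for orientation; the struct and helper names are illustrative, and unlike the asm, which consumes pre-added w[i]+K values from the WK window, this sketch adds K inline.

	#include <stdint.h>

	static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

	struct sha1_digest { uint32_t h[5]; };	/* illustrative, not the kernel type */

	static void sha1_compress(struct sha1_digest *dig, const uint32_t w[80])
	{
		uint32_t a = dig->h[0], b = dig->h[1], c = dig->h[2];
		uint32_t d = dig->h[3], e = dig->h[4];

		for (int i = 0; i < 80; i++) {
			uint32_t f, k;

			if (i < 20)      { f = d ^ (b & (c ^ d));       k = 0x5A827999; } /* F1 */
			else if (i < 40) { f = b ^ c ^ d;               k = 0x6ED9EBA1; } /* F2 */
			else if (i < 60) { f = ((b | c) & d) | (b & c); k = 0x8F1BBCDC; } /* F3 */
			else             { f = b ^ c ^ d;               k = 0xCA62C1D6; } /* F4 */

			uint32_t t = rol32(a, 5) + f + e + k + w[i];
			e = d; d = c; c = rol32(b, 30); b = a; a = t;
		}

		dig->h[0] += a;		/* UPDATE_HASH  0(HASH_PTR), A */
		dig->h[1] += b;
		dig->h[2] += c;
		dig->h[3] += d;		/* UPDATE_HASH 12(HASH_PTR), D */
		dig->h[4] += e;
	}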
	.set D, REG_D

	mov	D, REG_D
.macro F1 b, c, d
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.macro F2 b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
.macro F3 b, c, d
	and	\d, T1
.macro F4 b, c, d
	F2 \b, \c, \d
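F1 and F3 appear to use the usual strength-reduced forms of the SHA-1 round functions rather than the textbook expressions; the visible xor/and/xor sequence and the `and \d, T1` line are fragments of exactly these identities. A C sketch making them explicit (function names are illustrative):

	#include <stdint.h>

	/* F1: choose.  Textbook (b & c) | (~b & d) equals d ^ (b & (c ^ d)),
	 * which is what the xor/and/xor sequence above computes with T1 = c. */
	static uint32_t f1(uint32_t b, uint32_t c, uint32_t d)
	{
		return d ^ (b & (c ^ d));
	}

	/* F2 (and F4, which simply expands to F2): parity. */
	static uint32_t f2(uint32_t b, uint32_t c, uint32_t d)
	{
		return b ^ c ^ d;
	}

	/* F3: majority.  Textbook (b & c) | (b & d) | (c & d) equals
	 * ((b | c) & d) | (b & c), saving one operation. */
	static uint32_t f3(uint32_t b, uint32_t c, uint32_t d)
	{
		return ((b | c) & d) | (b & c);
	}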
/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d += w(i+1);
 *   d += t1;           a <<= 5;
 *   d += t1;
 */
.macro RR F, a, b, c, d, e, round
	\F	\b, \c, \d		# t1 = F(b, c, d);
	add	WK(\round + 1), \d
	add	T1, \d
	add	T1, \d
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
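Behaviorally, one RR invocation is two SHA-1 rounds, with the shuffle of working variables done by renaming macro arguments rather than moving registers; the `rotate:` comment records that renaming. A C sketch of the same two rounds follows (names are illustrative, and it does not reproduce the macro's exact instruction scheduling):

	#include <stdint.h>

	static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

	/* Two back-to-back rounds, as RR performs them; wk[] already holds
	 * w[i]+K, matching the WK(\round) operands above. */
	static void rr(uint32_t (*f)(uint32_t, uint32_t, uint32_t),
		       uint32_t *a, uint32_t *b, uint32_t *c,
		       uint32_t *d, uint32_t *e,
		       const uint32_t *wk, int round)
	{
		*e += wk[round] + f(*b, *c, *d) + rol32(*a, 5);		/* round i   */
		*b = rol32(*b, 30);

		*d += wk[round + 1] + f(*a, *b, *c) + rol32(*e, 5);	/* round i+1 */
		*a = rol32(*a, 30);
	}

The next invocation then passes (d, e, a, b, c) as (a, b, c, d, e), which is exactly the argument rotation visible in the RR call sequence earlier (RR F1,A,B,C,D,E,0 followed by RR F1,D,E,A,B,C,2).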
	.set i, ((\r) % 80)		# pre-compute for the next iteration
	.elseif (i < 80)		// rounds 32-79
/* message scheduling pre-compute for rounds 0-15 */
	.if ((i & 3) == 0)
	.elseif ((i & 3) == 1)
	.elseif ((i & 3) == 2)
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, WK(i&~3)
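For rounds 0-15 the pre-compute is simply: load 16 message bytes, byte-swap them to host order, add the round constant K, and store the four w[i]+K values to the window; the `movdqa W_TMP1, WK(i&~3)` store above is the last of those four `(i & 3)` phases. A scalar C equivalent (helper names and the K1 label are illustrative):

	#include <stdint.h>

	static uint32_t load_be32(const uint8_t *p)
	{
		return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
		       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
	}

	/* Rounds 0-15: w[i] is the big-endian message word itself; the asm
	 * byte-swaps and adds K four words at a time, one phase per round. */
	static void precalc_00_15(const uint8_t block[64], uint32_t w[16],
				  uint32_t wk[16])
	{
		const uint32_t K1 = 0x5A827999;		/* K for rounds 0-19 */

		for (int i = 0; i < 16; i++) {
			w[i]  = load_be32(block + 4 * i);
			wk[i] = w[i] + K1;
		}
	}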
/* message scheduling pre-compute for rounds 16-31
 *
 * - calculating last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
 *
 * some "heavy-lifting" vectorization for rounds 16-31 due to the w[i]->w[i-3]
 * dependency, but it pays off for rounds 32-79
 */
	# blended scheduling of vector and scalar instruction streams, one
	# 4-wide vector iteration per four scalar rounds
	.if ((i & 3) == 0)
		palignr	$8, W_minus_16, W	# w[i-14]
		psrldq	$4, W_TMP1		# w[i-3]
	.elseif ((i & 3) == 1)
	.elseif ((i & 3) == 2)
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, WK(i&~3)
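Rounds 16-31 are the hard ones to vectorize: within one 4-wide group, w[i+3] needs w[i] from the same group (the w[i-3] term, extracted above with psrldq). The trick is to compute all four lanes with that term zeroed, rotate by 1, and patch the top lane afterwards; since rotation distributes over xor, the missing contribution is exactly lane 0's pre-rotation value rotated by 2. A scalar sketch of one group (the function name is illustrative; the asm additionally adds K and stores the result to WK(i&~3)):

	#include <stdint.h>

	static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

	/* One 4-wide group of the rounds 16-31 schedule; i = 16, 20, 24 or 28.
	 * Lane 3 cannot use w[i] (it is being computed in the same vector),
	 * so it is computed with that term as 0 and patched after the rol 1:
	 * the hole is w[i] rol 1 == (x[0] rol 1) rol 1 == x[0] rol 2. */
	static void precalc_16_31_group(uint32_t w[], int i)
	{
		uint32_t x[4];

		for (int j = 0; j < 4; j++) {
			uint32_t w_i3 = (j < 3) ? w[i + j - 3] : 0;	/* lane 3: hole */
			x[j] = w_i3 ^ w[i + j - 8] ^ w[i + j - 14] ^ w[i + j - 16];
		}
		for (int j = 0; j < 4; j++)
			w[i + j] = rol32(x[j], 1);
		w[i + 3] ^= rol32(x[0], 2);	/* patch the hole in the top lane */
	}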
/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * instead we compute the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization, since the w[i] => w[i-3] dependency is broken
 */
	.if ((i & 3) == 0)
	.elseif ((i & 3) == 1)
	.elseif ((i & 3) == 2)
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, WK(i&~3)
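The equivalence stated in the comment follows from substituting the recurrence into itself once: rotation distributes over xor and the duplicated terms cancel, leaving w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 for i >= 32, whose nearest dependency, w[i-6], always lies outside the current 4-wide group. A small standalone self-check:

	#include <stdint.h>
	#include <stdio.h>

	static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

	int main(void)
	{
		uint32_t spec[80], fast[80];

		for (uint32_t i = 0; i < 16; i++)	/* arbitrary seed words */
			spec[i] = fast[i] = 0x9E3779B9u * (i + 1);

		for (int i = 16; i < 80; i++)		/* FIPS-180 schedule */
			spec[i] = rol32(spec[i-3] ^ spec[i-8] ^ spec[i-14] ^ spec[i-16], 1);

		for (int i = 16; i < 32; i++)		/* as in rounds 16-31 above */
			fast[i] = rol32(fast[i-3] ^ fast[i-8] ^ fast[i-14] ^ fast[i-16], 1);
		for (int i = 32; i < 80; i++)		/* rewritten recurrence */
			fast[i] = rol32(fast[i-6] ^ fast[i-16] ^ fast[i-28] ^ fast[i-32], 2);

		for (int i = 0; i < 80; i++)
			if (spec[i] != fast[i])
				return printf("mismatch at w[%d]\n", i), 1;

		printf("schedules match\n");
		return 0;
	}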
/* AVX (VEX-encoded) variants of the same pre-compute steps */
	.if ((i & 3) == 0)
	.elseif ((i & 3) == 1)
	.elseif ((i & 3) == 2)
	.elseif ((i & 3) == 3)
		vmovdqa	W_TMP1, WK(i&~3)
	.if ((i & 3) == 0)
		vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
		vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	.elseif ((i & 3) == 1)
	.elseif ((i & 3) == 2)
	.elseif ((i & 3) == 3)
		vmovdqu	W_TMP1, WK(i&~3)
	.if ((i & 3) == 0)
	.elseif ((i & 3) == 1)
	.elseif ((i & 3) == 2)
	.elseif ((i & 3) == 3)
		vmovdqu	W_TMP1, WK(i&~3)