Lines Matching +full:- +full:16 +full:g
2 # Implement fast SHA-256 with SSSE3 instructions. (x86_64)
21 # - Redistributions of source code must retain the above
25 # - Redistributions in binary form must reproduce the above
41 # This code is described in an Intel White-Paper:
42 # "Fast SHA-256 Implementations on Intel Architecture Processors"
58 # Add reg to mem using reg-mem add and store
87 SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
88 SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
104 g = %r10d define
115 _XFER_SIZE = 16
138 h = g
139 g = f define
150 ## compute W[-16] + W[-7] 4 at a time
153 ror $(25-11), y0 # y0 = e >> (25-11)
155 palignr $4, X2, XTMP0 # XTMP0 = W[-7]
156 ror $(22-13), y1 # y1 = a >> (22-13)
157 xor e, y0 # y0 = e ^ (e >> (25-11))
159 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
161 xor a, y1 # y1 = a ^ (a >> (22-13))
162 xor g, y2 # y2 = f^g
163 paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
164 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
165 and e, y2 # y2 = (f^g)&e
166 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
168 palignr $4, X0, XTMP1 # XTMP1 = W[-15]
169 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
171 xor g, y2 # y2 = CH = ((f^g)&e)^g
172 movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
176 movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
180 pslld $(32-7), XTMP1 #
187 por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
192 movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
195 movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
196 ror $(25-11), y0 # y0 = e >> (25-11)
197 xor e, y0 # y0 = e ^ (e >> (25-11))
199 ror $(22-13), y1 # y1 = a >> (22-13)
200 pslld $(32-18), XTMP3 #
201 xor a, y1 # y1 = a ^ (a >> (22-13))
202 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
203 xor g, y2 # y2 = f^g
205 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
206 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
207 and e, y2 # y2 = (f^g)&e
210 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
211 xor g, y2 # y2 = CH = ((f^g)&e)^g
212 psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
216 pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
225 pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
228 paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
233 movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
236 ror $(25-11), y0 # y0 = e >> (25-11)
237 movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
238 xor e, y0 # y0 = e ^ (e >> (25-11))
239 ror $(22-13), y1 # y1 = a >> (22-13)
241 xor a, y1 # y1 = a ^ (a >> (22-13))
242 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
243 psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
244 xor g, y2 # y2 = f^g
245 psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
246 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
247 and e, y2 # y2 = (f^g)&e
248 psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
249 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
250 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
251 xor g, y2 # y2 = CH = ((f^g)&e)^g
269 pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
274 movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
276 ror $(25-11), y0 # y0 = e >> (25-11)
278 movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
279 ror $(22-13), y1 # y1 = a >> (22-13)
280 xor e, y0 # y0 = e ^ (e >> (25-11))
282 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
283 psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
284 xor a, y1 # y1 = a ^ (a >> (22-13))
285 xor g, y2 # y2 = f^g
286 psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
287 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
288 and e, y2 # y2 = (f^g)&e
289 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
290 psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
291 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
293 xor g, y2 # y2 = CH = ((f^g)&e)^g
319 ror $(25-11), y0 # y0 = e >> (25-11)
321 xor e, y0 # y0 = e ^ (e >> (25-11))
322 ror $(22-13), y1 # y1 = a >> (22-13)
324 xor a, y1 # y1 = a ^ (a >> (22-13))
325 ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
326 xor g, y2 # y2 = f^g
327 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
328 ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
329 and e, y2 # y2 = (f^g)&e
330 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
332 xor g, y2 # y2 = CH = ((f^g)&e)^g
383 mov 4*6(CTX), g
393 ## byte swap first 16 dwords
394 COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
395 COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
396 COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
397 COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
401 ## schedule 48 input dwords, by doing 3 rounds of 16 each
403 .align 16
410 movdqa 1*16(TBL), XFER
415 movdqa 2*16(TBL), XFER
420 movdqa 3*16(TBL), XFER
423 add $4*16, TBL
437 paddd 1*16(TBL), X1
439 add $2*16, TBL
457 addm (4*6)(CTX),g
498 .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
499 .align 16
503 .section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
504 .align 16
505 # shuffle xBxA -> 00BA
509 .section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
510 .align 16
511 # shuffle xDxC -> DC00