########################################################################
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################

#include <linux/linkage.h>

## Syntax note: AT&T operand order is used throughout (op src, dst).

## assume buffers not aligned
#define MOVDQ movdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
# p1 = memory operand, p2 = register.  The sum is formed in the register
# and then written back, so both p1 and p2 hold mem + reg afterwards.
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
# p1 = destination xmm, p2 = source memory (loaded with the unaligned
# MOVDQ form), p3 = xmm holding the pshufb control mask.
.macro COPY_XMM_AND_BSWAP p1 p2 p3
        MOVDQ   \p2, \p1
        pshufb  \p3, \p1
.endm

################################

## X0..X3 carry the message-schedule dwords, four per register.
## They are assembler symbols, not fixed registers: rotate_Xs renames them.
X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

## SIMD scratch registers used while computing the schedule
XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9                            # staging register for the k+w values

SHUF_00BA = %xmm10                      # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11                      # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12

NUM_BLKS = %rdx                         # 3rd arg
INP = %rsi                              # 2nd arg
CTX = %rdi                              # 1st arg

## Working variables a..h are assembler symbols that ROTATE_ARGS renames
## after every round; the assignments below are only the initial mapping.
SRND = %rsi                             # clobbers INP
c = %ecx
d = %r8d
e = %edx                                # low half of %rdx, i.e. overlaps NUM_BLKS
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

## Scalar scratch registers for the round function
y0 = %r13d
y1 = %r14d
y2 = %r15d



## Stack frame: slot sizes first, then the byte offsets from %rsp
_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0                                    # offset 0
_INP = _INP_END + _INP_END_SIZE                 # offset 8
_XFER = _INP + _INP_SIZE                        # offset 16
_XMM_SAVE = _XFER + _XFER_SIZE                  # offset 32 (zero-sized slot)
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE         # 32 bytes in total

# rotate_Xs
# Rotate values of symbols X0...X3
# Pure assembly-time renaming (X0 <- X1 <- X2 <- X3 <- old X0);
# no instructions are emitted.
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
# Pure assembly-time renaming (the register that was h becomes a, a becomes
# b, and so on); no instructions are emitted.
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

# FOUR_ROUNDS_AND_SCHED
# Run four SHA-256 rounds, consuming the four k+w dwords stored at
# _XFER(%rsp), while computing the next four message-schedule dwords.
# The new dwords are accumulated in XTMP0 and finally written to the
# register currently named X0; the closing rotate_Xs renames it to X3.
# Scalar round instructions and SIMD schedule instructions are interleaved
# one by one (presumably for instruction-level parallelism), so keep the
# statement order intact.
# Notation in the per-line comments: >> applied to a or e stands for a
# rotate right (the code uses ror); s0/s1 are the schedule sigma functions,
# S0/S1 the round sigma functions.
# Clobbers y0-y2, XTMP0-XTMP4 and flags; updates the a..h registers.
.macro FOUR_ROUNDS_AND_SCHED
        ## compute s0 four at a time and s1 two at a time
        ## compute W[-16] + W[-7] 4 at a time

        ## ---- round 0 of 4: start s0, form W[-16] + W[-7] ----
        movdqa  X3, XTMP0
        mov     e, y0                   # y0 = e
        ror     $(25-11), y0            # y0 = e >> (25-11)
        mov     a, y1                   # y1 = a
        palignr $4, X2, XTMP0           # XTMP0 = W[-7]
        ror     $(22-13), y1            # y1 = a >> (22-13)
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        mov     f, y2                   # y2 = f
        ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
        movdqa  X1, XTMP1
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        xor     g, y2                   # y2 = f^g
        paddd   X0, XTMP0               # XTMP0 = W[-7] + W[-16]
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     e, y2                   # y2 = (f^g)&e
        ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
        ## compute s0
        palignr $4, X0, XTMP1           # XTMP1 = W[-15]
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        movdqa  XTMP1, XTMP2            # XTMP2 = W[-15]
        ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y0, y2                  # y2 = S1 + CH
        add     _XFER(%rsp) , y2        # y2 = k + w + S1 + CH
        movdqa  XTMP1, XTMP3            # XTMP3 = W[-15]
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        pslld   $(32-7), XTMP1          # XTMP1 = W[-15] << (32-7)
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        psrld   $7, XTMP2               # XTMP2 = W[-15] >> 7
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        por     XTMP2, XTMP1            # XTMP1 = W[-15] ror 7
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS

        ## ---- round 1 of 4: finish s0, start low s1 ----
        movdqa  XTMP3, XTMP2            # XTMP2 = W[-15]
        mov     e, y0                   # y0 = e
        mov     a, y1                   # y1 = a
        movdqa  XTMP3, XTMP4            # XTMP4 = W[-15]
        ror     $(25-11), y0            # y0 = e >> (25-11)
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        mov     f, y2                   # y2 = f
        ror     $(22-13), y1            # y1 = a >> (22-13)
        pslld   $(32-18), XTMP3         # XTMP3 = W[-15] << (32-18)
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     g, y2                   # y2 = f^g
        psrld   $18, XTMP2              # XTMP2 = W[-15] >> 18
        ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     e, y2                   # y2 = (f^g)&e
        ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        pxor    XTMP3, XTMP1            # XTMP1 = W[-15] ror 7 ^ W[-15] << (32-18)
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        psrld   $3, XTMP4               # XTMP4 = W[-15] >> 3
        add     y0, y2                  # y2 = S1 + CH
        add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
        ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        pxor    XTMP2, XTMP1            # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        pxor    XTMP4, XTMP1            # XTMP1 = s0
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        ## compute low s1
        pshufd  $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        paddd   XTMP1, XTMP0            # XTMP0 = W[-16] + W[-7] + s0
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS

        ## ---- round 2 of 4: finish low s1 (W[0], W[1]), start high s1 ----
        movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {BBAA}
        mov     e, y0                   # y0 = e
        mov     a, y1                   # y1 = a
        ror     $(25-11), y0            # y0 = e >> (25-11)
        movdqa  XTMP2, XTMP4            # XTMP4 = W[-2] {BBAA}
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        ror     $(22-13), y1            # y1 = a >> (22-13)
        mov     f, y2                   # y2 = f
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
        psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xBxA}
        xor     g, y2                   # y2 = f^g
        psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xBxA}
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     e, y2                   # y2 = (f^g)&e
        psrld   $10, XTMP4              # XTMP4 = W[-2] >> 10 {BBAA}
        ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        pxor    XTMP3, XTMP2            # XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
        add     y0, y2                  # y2 = S1 + CH
        ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
        pxor    XTMP2, XTMP4            # XTMP4 = s1 {xBxA}
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        pshufb  SHUF_00BA, XTMP4        # XTMP4 = s1 {00BA}
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        paddd   XTMP4, XTMP0            # XTMP0 = {..., ..., W[1], W[0]}
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        ## compute high s1
        pshufd  $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS

        ## ---- round 3 of 4: finish high s1 (W[2], W[3]) into X0 ----
        movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {DDCC}
        mov     e, y0                   # y0 = e
        ror     $(25-11), y0            # y0 = e >> (25-11)
        mov     a, y1                   # y1 = a
        movdqa  XTMP2, X0               # X0 = W[-2] {DDCC}
        ror     $(22-13), y1            # y1 = a >> (22-13)
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        mov     f, y2                   # y2 = f
        ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
        psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xDxC}
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        xor     g, y2                   # y2 = f^g
        psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xDxC}
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     e, y2                   # y2 = (f^g)&e
        ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
        psrld   $10, X0                 # X0 = W[-2] >> 10 {DDCC}
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        pxor    XTMP3, XTMP2            # XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
        ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y0, y2                  # y2 = S1 + CH
        add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
        pxor    XTMP2, X0               # X0 = s1 {xDxC}
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        pshufb  SHUF_DC00, X0           # X0 = s1 {DC00}
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        paddd   XTMP0, X0               # X0 = {W[3], W[2], W[1], W[0]}
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        rotate_Xs
.endm

## DO_ROUND input is the k+w dword at [rsp + _XFER + round * 4]
## DO_ROUND round
## One SHA-256 round with no message scheduling.  The round argument
## (0..3 at the call sites) selects which k+w dword of the 16-byte
## _XFER stack slot is consumed.
## In the per-line comments, >> on a or e stands for a rotate right
## (the code uses ror).
## Clobbers y0-y2 and flags; updates d and h, then renames a..h.
.macro DO_ROUND round
        mov     e, y0                   # y0 = e
        ror     $(25-11), y0            # y0 = e >> (25-11)
        mov     a, y1                   # y1 = a
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        ror     $(22-13), y1            # y1 = a >> (22-13)
        mov     f, y2                   # y2 = f
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     g, y2                   # y2 = f^g
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
        and     e, y2                   # y2 = (f^g)&e
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        add     y0, y2                  # y2 = S1 + CH
        ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        offset = \round * 4 + _XFER
        add     offset(%rsp), y2        # y2 = k + w + S1 + CH
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_ssse3(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 (%rdi) : pointer to digest
## arg 2 (%rsi) : pointer to input data
## arg 3 (%rdx) : Num blocks
##
## Processes num_blks consecutive 64-byte blocks and adds the result into
## the eight digest dwords in place.  Returns immediately, leaving the
## digest untouched, when num_blks is zero.
## Saves and restores rbx, rbp and r12-r15; every other register it uses
## (rax, rcx, rdx, rsi, r8-r11, xmm0-xmm12) is left clobbered.
## Branch targets use the .L prefix so that they stay assembler-local and
## do not show up in the object symbol table.
########################################################################
.text
ENTRY(sha256_transform_ssse3)
.align 32
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        pushq   %rbp
        mov     %rsp, %rbp              # rbp remembers the unaligned rsp

        subq    $STACK_SIZE, %rsp
        and     $~15, %rsp              # 16-byte align so movdqa to _XFER(%rsp) is legal

        shl     $6, NUM_BLKS            # convert to bytes
        jz      .Ldone_hash             # nothing to do for zero blocks
        add     INP, NUM_BLKS
        mov     NUM_BLKS, _INP_END(%rsp) # pointer to end of data

        ## load initial digest
        ## (e lives in %edx, so NUM_BLKS is dead from here on; its value
        ## was saved to _INP_END above)
        mov     4*0(CTX), a
        mov     4*1(CTX), b
        mov     4*2(CTX), c
        mov     4*3(CTX), d
        mov     4*4(CTX), e
        mov     4*5(CTX), f
        mov     4*6(CTX), g
        mov     4*7(CTX), h

        movdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        movdqa  _SHUF_00BA(%rip), SHUF_00BA
        movdqa  _SHUF_DC00(%rip), SHUF_DC00

        ## outer loop: one iteration per 64-byte input block
.Lloop0:
        lea     K256(%rip), TBL

        ## byte swap first 16 dwords
        COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK

        mov     INP, _INP(%rsp)         # SRND reuses %rsi, so park INP on the stack

        ## schedule 48 input dwords, by doing 3 rounds of 16 each
        mov     $3, SRND
.align 16
.Lloop1:
        movdqa  (TBL), XFER
        paddd   X0, XFER                # XFER = k + w for the next four rounds
        movdqa  XFER, _XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        movdqa  1*16(TBL), XFER
        paddd   X0, XFER
        movdqa  XFER, _XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        movdqa  2*16(TBL), XFER
        paddd   X0, XFER
        movdqa  XFER, _XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        movdqa  3*16(TBL), XFER
        paddd   X0, XFER
        movdqa  XFER, _XFER(%rsp)
        add     $4*16, TBL              # advance past the 16 constants just used
        FOUR_ROUNDS_AND_SCHED

        sub     $1, SRND
        jne     .Lloop1

        ## last 16 rounds: no scheduling left, 2 iterations of 8 rounds
        mov     $2, SRND
.Lloop2:
        paddd   (TBL), X0
        movdqa  X0, _XFER(%rsp)
        DO_ROUND        0
        DO_ROUND        1
        DO_ROUND        2
        DO_ROUND        3
        paddd   1*16(TBL), X1
        movdqa  X1, _XFER(%rsp)
        add     $2*16, TBL
        DO_ROUND        0
        DO_ROUND        1
        DO_ROUND        2
        DO_ROUND        3

        movdqa  X2, X0                  # second pass consumes X2 and X3
        movdqa  X3, X1

        sub     $1, SRND
        jne     .Lloop2

        ## fold the working variables back into the digest
        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        mov     _INP(%rsp), INP
        add     $64, INP                # next block
        cmp     _INP_END(%rsp), INP
        jne     .Lloop0

.Ldone_hash:

        mov     %rbp, %rsp              # drop the aligned frame
        popq    %rbp
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx

        ret
ENDPROC(sha256_transform_ssse3)

## SHA-256 round constants, 64 dwords, consumed 16 bytes at a time via TBL
.section        .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

## pshufb control that reverses the byte order inside each dword
.section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203

.section        .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
        .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section        .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
        .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF