########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
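#
# For orientation only: a minimal C sketch of the scalar SHA-256 round
# (per FIPS 180-4) that each "RND N + k" block in the macros below computes,
# interleaved with AVX2 message scheduling.  This is an editorial reference
# sketch, not part of the build; the helper name sha256_round() is hypothetical.
#
#	#include <stdint.h>
#
#	static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
#
#	static inline uint32_t Ch(uint32_t e, uint32_t f, uint32_t g)  { return (e & (f ^ g)) ^ g; }
#	static inline uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) { return ((a | c) & b) | (a & c); }
#	static inline uint32_t S0(uint32_t a) { return ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22); }
#	static inline uint32_t S1(uint32_t e) { return ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25); }
#
#	/* one round; k_plus_w is K[t] + W[t], precomputed on the stack below */
#	static void sha256_round(uint32_t s[8], uint32_t k_plus_w)
#	{
#		uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
#		uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
#		uint32_t t1 = h + S1(e) + Ch(e, f, g) + k_plus_w;
#		uint32_t t2 = S0(a) + Maj(a, b, c);
#
#		s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
#		s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
#	}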
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi	# 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e	= %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP


TBL	= %rbp
SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e				# CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d			# --

	and	b, y3		# y3 = (a|c)&b				# MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB

	add	y0, y2		# y2 = S1 + CH				# --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7
	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b				# MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c				# MAJA
	mov	f, y2		# y2 = f				# CH
	xor	g, y2		# y2 = f^g				# CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e				# CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ

	add	y1, h		# h = k + w + h + S0			# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS
	rotate_Xs
.endm
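#
# For reference, the message-schedule recurrence that the vector instructions
# interleaved above compute four words at a time.  Editorial sketch only,
# assuming the standard FIPS 180-4 definitions; not part of the build, and
# sha256_schedule() is a hypothetical name.
#
#	#include <stdint.h>
#
#	static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
#	static inline uint32_t s0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
#	static inline uint32_t s1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }
#
#	/* W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16], for t = 16..63 */
#	static void sha256_schedule(uint32_t W[64])
#	{
#		for (int t = 16; t < 64; t++)
#			W[t] = s1(W[t - 2]) + W[t - 7] + s0(W[t - 15]) + W[t - 16];
#	}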
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
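#
# A C-level equivalent of this interface, for orientation only.  It reuses the
# sha256_round()/sha256_schedule() sketches earlier in this file and assumes a
# hypothetical K256_c[64] array holding each round constant once (the K256
# table below stores every group of four twice for the two-lane layout).
# Editorial sketch, not the kernel glue code, and not part of the build.
#
#	static void sha256_transform_generic(uint32_t digest[8],
#					     const uint8_t *data,
#					     uint64_t num_blks)
#	{
#		while (num_blks--) {
#			uint32_t W[64], s[8];
#			int t;
#
#			/* load one 64-byte block big-endian, as the
#			 * BYTE_FLIP_MASK shuffles do below */
#			for (t = 0; t < 16; t++)
#				W[t] = ((uint32_t)data[4*t] << 24) |
#				       ((uint32_t)data[4*t + 1] << 16) |
#				       ((uint32_t)data[4*t + 2] << 8) |
#				       (uint32_t)data[4*t + 3];
#			sha256_schedule(W);
#
#			for (t = 0; t < 8; t++)
#				s[t] = digest[t];
#			for (t = 0; t < 64; t++)
#				sha256_round(s, K256_c[t] + W[t]);
#			for (t = 0; t < 8; t++)
#				digest[t] += s[t];
#			data += 64;
#		}
#	}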
.text
ENTRY(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	lea	K256(%rip), TBL

	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	0*32(TBL, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	1*32(TBL, SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	2*32(TBL, SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	3*32(TBL, SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	0*32(TBL, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32
	vpaddd	1*32(TBL, SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

do_last_block:
	#### do last block
	lea	K256(%rip), TBL

	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	ret
ENDPROC(sha256_transform_rorx)
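# Note on the K256 table below: each row of four round constants appears twice,
# so a single aligned 32-byte load at (TBL, SRND) feeds the same four constants
# to both 128-bit lanes of a ymm register, i.e. to the block-1 words (low lane)
# and block-2 words (high lane) that are processed together.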
.section .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF

#endif