########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
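#
# For reference, the quantities named in the per-instruction comments
# throughout this file are the standard FIPS 180-4 SHA-256 round terms:
#
#	S0(a) = (a ror 2)  ^ (a ror 13) ^ (a ror 22)
#	S1(e) = (e ror 6)  ^ (e ror 11) ^ (e ror 25)
#	CH(e,f,g)  = (e & f) ^ (~e & g)        computed here as ((f^g) & e) ^ g
#	MAJ(a,b,c) = (a&b) ^ (a&c) ^ (b&c)     computed here as ((a|c) & b) | (a&c)
#
# and one round updates the working variables roughly as (C-like sketch):
#
#	t1 = h + S1(e) + CH(e,f,g) + k[i] + w[i];
#	t2 = S0(a) + MAJ(a,b,c);
#	d += t1;
#	h  = t1 + t2;		/* then a..h are renamed one position down */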
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =	%ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 =	%ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi	# 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e	= %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
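# Message schedule (the "SCHED" half of the macro below), in scalar terms:
#
#	for (t = 16; t < 64; t++)
#		w[t] = s1(w[t-2]) + w[t-7] + s0(w[t-15]) + w[t-16];
#	where	s0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
#		s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
#
# Each FOUR_ROUNDS_AND_SCHED invocation runs four rounds and, interleaved
# with them, computes the next four schedule words for both blocks (one
# block per 128-bit lane of X0..X3).  s1 is evaluated two dwords at a time
# ({xBxA}, then {xDxC}): each dword is duplicated into a qword so the 17-
# and 19-bit rotates can be formed with 64-bit vpsrlq shifts, and the two
# halves are recombined with SHUF_00BA / SHUF_DC00.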
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e				# CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d			# --

	and	b, y3		# y3 = (a|c)&b				# MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB

	add	y0, y2		# y2 = S1 + CH				# --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b				# MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c				# MAJA
	mov	f, y2		# y2 = f				# CH
	xor	g, y2		# y2 = f^g				# CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e				# CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ

	add	y1, h		# h = k + w + h + S0			# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS
	rotate_Xs
.endm
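# DO_4ROUNDS runs four rounds using w + k values already sitting in the
# _XFER stack area, with no message scheduling.  Unlike the macro above,
# the final two additions into h are deferred: the "add y2/y3, old_h" at
# the top of rounds N+1..N+3 completes the previous round's h (ROTATE_ARGS
# keeps old_h naming the register that was h), and only round N+3 finishes
# h inline, letting the tail of one round overlap the head of the next.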

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --

	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
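# Rough processing flow (two 64-byte blocks per pass):
#   loop0  load and byte-swap two blocks, interleaving them into the low
#          and high 128-bit lanes of X0..X3
#   loop1  48 rounds of block 1 with message scheduling; the w + k vectors
#          for both lanes are saved to the _XFER stack area
#   loop2  last 16 rounds of block 1, no further scheduling needed
#   loop3  all 64 rounds of block 2, reusing the saved w + k values
# A trailing odd block takes the do_last_block / only_one_block path.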
.text
ENTRY(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

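	## The vmovdqa stores to _XFER above saved full 32-byte w + k
	## vectors, so the high 16 bytes of each slot already hold the
	## second block's values; loop3 below reads them at offset +16.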
	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

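# do_last_block handles a final odd block: it is loaded with 128-bit moves,
# so only the low lane of X0..X3 carries real data.  The words scheduled
# for the unused high lane are never consumed, since the "ja done_hash"
# check after the first pass is taken before loop3 would run.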
do_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
ENDPROC(sha256_transform_rorx)

.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF

#endif