########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
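#
# Overview (summarising the code below): the low 128-bit lane of each of
# X0..X3 carries four message words of the first block and the high lane
# the corresponding words of the second block (see the vperm2i128
# transpose in loop0).  The vector unit builds the message schedule and
# the (w + k) sums for both blocks while the scalar unit runs the 64
# rounds of the first block (loop1/loop2); the (w + k) values are parked
# in the _XFER stack area, and loop3 then replays the second block's 64
# rounds from the saved "+16" halves of that area with no further
# scheduling work.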

#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi	# 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e	= %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
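
# For reference, one scalar SHA-256 round in C-like pseudocode (FIPS 180-4;
# a sketch to map the interleaved rorx/xor/and sequences below back to the
# specification -- these lines are comments only and are not assembled):
#
#	S1  = ror(e, 6) ^ ror(e, 11) ^ ror(e, 25)
#	ch  = (e & f) ^ (~e & g)		# == ((f ^ g) & e) ^ g
#	t1  = h + S1 + ch + k[t] + w[t]
#	S0  = ror(a, 2) ^ ror(a, 13) ^ ror(a, 22)
#	maj = (a & b) ^ (a & c) ^ (b & c)	# == ((a | c) & b) | (a & c)
#	t2  = S0 + maj
#	h = g; g = f; f = e; e = d + t1
#	d = c; c = b; b = a; a = t1 + t2
#
# The round macros below never move data between the a..h registers:
# ROTATE_ARGS (above) renames the assembler symbols instead, and old_h
# lets a round finish the previous round's h additions (see DO_4ROUNDS).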

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e				# CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d			# --

	and	b, y3		# y3 = (a|c)&b				# MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB

	add	y0, y2		# y2 = S1 + CH				# --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b				# MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c				# MAJA
	mov	f, y2		# y2 = f				# CH
	xor	g, y2		# y2 = f^g				# CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e				# CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --

	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ

	add	y1, h		# h = k + w + h + S0			# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS
	rotate_Xs
.endm
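
# FOUR_ROUNDS_AND_SCHED interleaves four rounds with the computation of the
# next four schedule words of each 128-bit lane.  For reference, the
# recurrence it implements (FIPS 180-4 notation, sketch only):
#
#	s0   = ror(W[t-15], 7) ^ ror(W[t-15], 18) ^ (W[t-15] >>  3)
#	s1   = ror(W[t-2], 17) ^ ror(W[t-2], 19)  ^ (W[t-2]  >> 10)
#	W[t] = W[t-16] + s0 + W[t-7] + s1
#
# AVX2 has no 32-bit vector rotate, so s0 is assembled from vpsrld/vpslld/
# vpor pairs, and s1 from 64-bit vpsrlq shifts on the {xBxA}/{xDxC}
# arrangements of W[-2], compacted with SHUF_00BA/SHUF_DC00 before being
# added into X0.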

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --

	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS

.endm
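
# DO_4ROUNDS performs four rounds using (w + k) values that are already on
# the stack, so it serves both the last 16 rounds of a block (loop2 below,
# which still stores fresh XFER values) and the entire second block (loop3,
# which only replays the "+16" halves saved earlier).  Unlike
# FOUR_ROUNDS_AND_SCHED it defers the final h += S1 + CH and h += MAJ
# additions into the following round via old_h to shorten the dependency
# chain.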

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
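##
## Minimal C-side usage sketch (illustrative only -- the real caller is the
## kernel's SHA-256 glue code, which also handles padding, partial blocks
## and digest byte-swapping; sha256_iv_init() is a hypothetical helper that
## loads the standard initial hash values into state->state):
##
##	struct sha256_state st;
##	sha256_iv_init(&st);
##	kernel_fpu_begin();		/* AVX2 use requires the FPU context */
##	sha256_transform_rorx(&st, data, nblocks);	/* nblocks * 64 bytes */
##	kernel_fpu_end();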
.text
SYM_FUNC_START(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP), XTMP0
	VMOVDQ	1*32(INP), XTMP1
	VMOVDQ	2*32(INP), XTMP2
	VMOVDQ	3*32(INP), XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 12 each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

do_last_block:
	VMOVDQ	0*16(INP), XWORD0
	VMOVDQ	1*16(INP), XWORD1
	VMOVDQ	2*16(INP), XWORD2
	VMOVDQ	3*16(INP), XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX), a
	mov	(4*1)(CTX), b
	mov	(4*2)(CTX), c
	mov	(4*3)(CTX), d
	mov	(4*4)(CTX), e
	mov	(4*5)(CTX), f
	mov	(4*6)(CTX), g
	mov	(4*7)(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
SYM_FUNC_END(sha256_transform_rorx)
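
# The K256 table below stores each group of four round constants twice, so
# one 32-byte vpaddd in loop1/loop2 adds k[t..t+3] to the schedule words of
# both interleaved blocks at once.  PSHUFFLE_BYTE_FLIP_MASK byte-swaps each
# 32-bit word of the big-endian message; _SHUF_00BA/_SHUF_DC00 compact the
# s1 results as noted alongside FOUR_ROUNDS_AND_SCHED above.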

.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF