########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
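#
# Data layout note (illustrative, inferred from the loads and the
# vperm2i128 shuffles in loop0 below): each 256-bit ymm register holds
# four message dwords of block 1 in its low 128-bit lane and the
# corresponding four dwords of block 2 in its high lane, e.g.
#
#	X0 = { blk2 W[3..0] | blk1 W[3..0] }	(high lane | low lane)
#
# The pre-added K+W values for both blocks are spilled to the _XFER
# stack area; block 1 is hashed while the schedule is computed, and
# block 2 is then replayed from the saved _XFER words (see loop3).
#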

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =	%ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 =	%ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi	# 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e	= %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
STACK_SIZE	= _CTX      + _CTX_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
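
# Illustrative expansion (not assembled): ROTATE_ARGS and rotate_Xs only
# rebind assembler symbols, so no mov instructions are emitted.  With the
# initial mapping above (a=%eax ... h=%r11d), one ROTATE_ARGS yields
#
#	a = %r11d   b = %eax    c = %ebx    d = %ecx
#	e = %r8d    f = %edx    g = %r9d    h = %r10d
#
# i.e. each round's result stays where it was computed and only the
# names rotate, not the data.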

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e				# CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d			# --

	and	b, y3		# y3 = (a|c)&b				# MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB

	add	y0, y2		# y2 = S1 + CH				# --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b				# MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c				# MAJA
	mov	f, y2		# y2 = f				# CH
	xor	g, y2		# y2 = f^g				# CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e				# CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ

	add	y1, h		# h = k + w + h + S0			# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS
	rotate_Xs
.endm
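
# For reference (illustrative, mirrors the vector operations above): each
# FOUR_ROUNDS_AND_SCHED invocation interleaves four rounds of the compression
# function with computing the next four message-schedule words
#
#	s0(w) = (w ror  7) ^ (w ror 18) ^ (w >>  3)
#	s1(w) = (w ror 17) ^ (w ror 19) ^ (w >> 10)
#	W[t]  = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])
#
# for both blocks at once, one block per 128-bit lane.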

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS

.endm
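
# Round computation implemented by the scalar instructions above, for
# reference (FIPS 180-4 notation; Ch and Maj are algebraically rewritten
# as in the comments):
#
#	S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#	S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#	Ch  = ((f ^ g) & e) ^ g
#	Maj = ((a | c) & b) | (a & c)
#	t1  = h + S1 + Ch + K[t] + W[t]
#	t2  = S0 + Maj
#	d  += t1
#	h   = t1 + t2
#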
########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
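##
## Illustrative C-side usage (assumed glue code, not part of this file):
## the SIMD register state must be protected by kernel_fpu_begin()/
## kernel_fpu_end() around the call, e.g.
##
##	kernel_fpu_begin();
##	sha256_transform_rorx(state, data, blocks);
##	kernel_fpu_end();
##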
.text
SYM_TYPED_FUNC_START(sha256_transform_rorx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	push	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary

	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

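# Exactly one block remains at this point, so it is loaded in four 16-byte
# halves into the xmm views of X0..X3, byte-swapped with the xmm form of the
# flip mask, and fed back into the normal path at last_block_enter.  The high
# ymm lanes carry no second block here; their scheduled words land in _XFER
# but are never consumed because the second-block pass (loop3) is skipped.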
do_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	%rbp, %rsp
	pop	%rbp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	RET
SYM_FUNC_END(sha256_transform_rorx)

# Each group of four round constants is stored twice so a single vpaddd adds
# the same K values to both 128-bit lanes (block 1 and block 2).
.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF