########################################################################
# Implement fast SHA-512 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     David Cote <david.m.cote@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-512 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
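#
# For reference, the SHA-512 operations (FIPS 180-4) that the macros below
# compute, written in the shorthand used by the inline comments:
#
#     S0(a) = (a ror 28) ^ (a ror 34) ^ (a ror 39)
#     S1(e) = (e ror 14) ^ (e ror 18) ^ (e ror 41)
#     CH    = ((f ^ g) & e) ^ g
#     MAJ   = ((a | c) & b) | (a & c)
#     s0(x) = (x ror 1)  ^ (x ror 8)  ^ (x >> 7)
#     s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6)
#
#     W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
#     t1    = h + S1 + CH + K[t] + W[t]
#     new e = d + t1, new a = t1 + S0 + MAJ, all other words shift right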
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>

.text

# Virtual Registers
Y_0 = %ymm4
Y_1 = %ymm5
Y_2 = %ymm6
Y_3 = %ymm7

YTMP0 = %ymm0
YTMP1 = %ymm1
YTMP2 = %ymm2
YTMP3 = %ymm3
YTMP4 = %ymm8
XFER  = YTMP0

BYTE_FLIP_MASK = %ymm9

# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
CTX1 = %rdi
CTX2 = %r12
# 2nd arg
INP = %rsi
# 3rd arg
NUM_BLKS = %rdx

c = %rcx
d = %r8
e = %rdx
y3 = %rsi

TBL = %rdi # clobbers CTX1

a = %rax
b = %rbx

f = %r9
g = %r10
h = %r11
old_h = %r11

T1 = %r12 # clobbers CTX2
y0 = %r13
y1 = %r14
y2 = %r15

# Local variables (stack frame)
XFER_SIZE = 4*8
SRND_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
CTX_SIZE = 1*8
RSPSAVE_SIZE = 1*8
GPRSAVE_SIZE = 5*8

frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
frame_INP = frame_SRND + SRND_SIZE
frame_INPEND = frame_INP + INP_SIZE
frame_CTX = frame_INPEND + INPEND_SIZE
frame_RSPSAVE = frame_CTX + CTX_SIZE
frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
frame_size = frame_GPRSAVE + GPRSAVE_SIZE

## assume buffers not aligned
#define VMOVDQ vmovdqu

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm


# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
# Load ymm with mem and byte swap each qword
.macro COPY_YMM_AND_BSWAP p1 p2 p3
        VMOVDQ  \p2, \p1
        vpshufb \p3, \p1, \p1
.endm

# rotate_Ys
# Rotate values of symbols Y0...Y3
.macro rotate_Ys
        Y_ = Y_0
        Y_0 = Y_1
        Y_1 = Y_2
        Y_2 = Y_3
        Y_3 = Y_
.endm

# RotateState
.macro RotateState
        # Rotate symbols a..h right
        old_h = h
        TMP_  = h
        h     = g
        g     = f
        f     = e
        e     = d
        d     = c
        c     = b
        b     = a
        a     = TMP_
.endm

# macro MY_VPALIGNR   YDST, YSRC1, YSRC2, RVAL
# YDST = {YSRC1, YSRC2} >> RVAL*8
.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
        vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST  # YDST = {YS1_LO, YS2_HI}
        vpalignr   $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8
.endm

.macro FOUR_ROUNDS_AND_SCHED
################################### RND N + 0 #########################################

        # Extract w[t-7]
        MY_VPALIGNR     YTMP0, Y_3, Y_2, 8              # YTMP0 = W[-7]
        # Calculate w[t-16] + w[t-7]
        vpaddq          Y_0, YTMP0, YTMP0               # YTMP0 = W[-7] + W[-16]
        # Extract w[t-15]
        MY_VPALIGNR     YTMP1, Y_1, Y_0, 8              # YTMP1 = W[-15]

        # Calculate sigma0

        # Calculate w[t-15] ror 1
        vpsrlq          $1, YTMP1, YTMP2
        vpsllq          $(64-1), YTMP1, YTMP3
        vpor            YTMP2, YTMP3, YTMP3             # YTMP3 = W[-15] ror 1
        # Calculate w[t-15] shr 7
        vpsrlq          $7, YTMP1, YTMP4                # YTMP4 = W[-15] >> 7

        mov     a, y3                   # y3 = a                                # MAJA
        rorx    $41, e, y0              # y0 = e >> 41                          # S1A
        rorx    $18, e, y1              # y1 = e >> 18                          # S1B
        add     frame_XFER(%rsp), h     # h = k + w + h                         # --
        or      c, y3                   # y3 = a|c                              # MAJA
        mov     f, y2                   # y2 = f                                # CH
        rorx    $34, a, T1              # T1 = a >> 34                          # S0B

        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18)                # S1
        xor     g, y2                   # y2 = f^g                              # CH
        rorx    $14, e, y1              # y1 = (e >> 14)                        # S1

        and     e, y2                   # y2 = (f^g)&e                          # CH
        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18) ^ (e>>14)      # S1
        rorx    $39, a, y1              # y1 = a >> 39                          # S0A
        add     h, d                    # d = k + w + h + d                     # --

        and     b, y3                   # y3 = (a|c)&b                          # MAJA
        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34)                # S0
        rorx    $28, a, T1              # T1 = (a >> 28)                        # S0

        xor     g, y2                   # y2 = CH = ((f^g)&e)^g                 # CH
        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34) ^ (a>>28)      # S0
        mov     a, T1                   # T1 = a                                # MAJB
        and     c, T1                   # T1 = a&c                              # MAJB

        add     y0, y2                  # y2 = S1 + CH                          # --
        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h                   # h = k + w + h + S0                    # --

        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1  # --

        add     y2, h                   # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        add     y3, h                   # h = t1 + S0 + MAJ                     # --

        RotateState

################################### RND N + 1 #########################################

        # Calculate w[t-15] ror 8
        vpsrlq          $8, YTMP1, YTMP2
        vpsllq          $(64-8), YTMP1, YTMP1
        vpor            YTMP2, YTMP1, YTMP1             # YTMP1 = W[-15] ror 8
        # XOR the three components
        vpxor           YTMP4, YTMP3, YTMP3             # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
        vpxor           YTMP1, YTMP3, YTMP1             # YTMP1 = s0


        # Add three components, w[t-16], w[t-7] and sigma0
        vpaddq          YTMP1, YTMP0, YTMP0             # YTMP0 = W[-16] + W[-7] + s0
        # Move to appropriate lanes for calculating w[16] and w[17]
        vperm2f128      $0x0, YTMP0, YTMP0, Y_0         # Y_0 = W[-16] + W[-7] + s0 {BABA}
        # Move to appropriate lanes for calculating w[18] and w[19]
        vpand           MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}

        # Calculate w[16] and w[17] in both 128 bit lanes

        # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
        vperm2f128      $0x11, Y_3, Y_3, YTMP2          # YTMP2 = W[-2] {BABA}
        vpsrlq          $6, YTMP2, YTMP4                # YTMP4 = W[-2] >> 6 {BABA}


        mov     a, y3                   # y3 = a                                # MAJA
        rorx    $41, e, y0              # y0 = e >> 41                          # S1A
        rorx    $18, e, y1              # y1 = e >> 18                          # S1B
        add     1*8+frame_XFER(%rsp), h # h = k + w + h                         # --
        or      c, y3                   # y3 = a|c                              # MAJA


        mov     f, y2                   # y2 = f                                # CH
        rorx    $34, a, T1              # T1 = a >> 34                          # S0B
        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18)                # S1
        xor     g, y2                   # y2 = f^g                              # CH


        rorx    $14, e, y1              # y1 = (e >> 14)                        # S1
        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18) ^ (e>>14)      # S1
        rorx    $39, a, y1              # y1 = a >> 39                          # S0A
        and     e, y2                   # y2 = (f^g)&e                          # CH
        add     h, d                    # d = k + w + h + d                     # --

        and     b, y3                   # y3 = (a|c)&b                          # MAJA
        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34)                # S0

        rorx    $28, a, T1              # T1 = (a >> 28)                        # S0
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g                 # CH

        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34) ^ (a>>28)      # S0
        mov     a, T1                   # T1 = a                                # MAJB
        and     c, T1                   # T1 = a&c                              # MAJB
        add     y0, y2                  # y2 = S1 + CH                          # --

        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h                   # h = k + w + h + S0                    # --

        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h                   # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        add     y3, h                   # h = t1 + S0 + MAJ                     # --

        RotateState


################################### RND N + 2 #########################################

        vpsrlq          $19, YTMP2, YTMP3               # YTMP3 = W[-2] >> 19 {BABA}
        vpsllq          $(64-19), YTMP2, YTMP1          # YTMP1 = W[-2] << 19 {BABA}
        vpor            YTMP1, YTMP3, YTMP3             # YTMP3 = W[-2] ror 19 {BABA}
        vpxor           YTMP3, YTMP4, YTMP4             # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
        vpsrlq          $61, YTMP2, YTMP3               # YTMP3 = W[-2] >> 61 {BABA}
        vpsllq          $(64-61), YTMP2, YTMP1          # YTMP1 = W[-2] << 61 {BABA}
        vpor            YTMP1, YTMP3, YTMP3             # YTMP3 = W[-2] ror 61 {BABA}
        vpxor           YTMP3, YTMP4, YTMP4             # YTMP4 = s1 = (W[-2] ror 19) ^
                                                        #  (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}

        # Add sigma1 to the other components to get w[16] and w[17]
        vpaddq          YTMP4, Y_0, Y_0                 # Y_0 = {W[1], W[0], W[1], W[0]}

        # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
        vpsrlq          $6, Y_0, YTMP4                  # YTMP4 = W[-2] >> 6 {DC--}

        mov     a, y3                   # y3 = a                                # MAJA
        rorx    $41, e, y0              # y0 = e >> 41                          # S1A
        add     2*8+frame_XFER(%rsp), h # h = k + w + h                         # --

        rorx    $18, e, y1              # y1 = e >> 18                          # S1B
        or      c, y3                   # y3 = a|c                              # MAJA
        mov     f, y2                   # y2 = f                                # CH
        xor     g, y2                   # y2 = f^g                              # CH

        rorx    $34, a, T1              # T1 = a >> 34                          # S0B
        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18)                # S1
        and     e, y2                   # y2 = (f^g)&e                          # CH

        rorx    $14, e, y1              # y1 = (e >> 14)                        # S1
        add     h, d                    # d = k + w + h + d                     # --
        and     b, y3                   # y3 = (a|c)&b                          # MAJA

        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18) ^ (e>>14)      # S1
        rorx    $39, a, y1              # y1 = a >> 39                          # S0A
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g                 # CH

        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34)                # S0
        rorx    $28, a, T1              # T1 = (a >> 28)                        # S0

        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34) ^ (a>>28)      # S0
        mov     a, T1                   # T1 = a                                # MAJB
        and     c, T1                   # T1 = a&c                              # MAJB
        add     y0, y2                  # y2 = S1 + CH                          # --

        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h                   # h = k + w + h + S0                    # --
        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h                   # h = k + w + h + S0 + S1 + CH = t1 + S0 # --

        add     y3, h                   # h = t1 + S0 + MAJ                     # --

        RotateState

################################### RND N + 3 #########################################

        vpsrlq          $19, Y_0, YTMP3                 # YTMP3 = W[-2] >> 19 {DC--}
        vpsllq          $(64-19), Y_0, YTMP1            # YTMP1 = W[-2] << 19 {DC--}
        vpor            YTMP1, YTMP3, YTMP3             # YTMP3 = W[-2] ror 19 {DC--}
        vpxor           YTMP3, YTMP4, YTMP4             # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
        vpsrlq          $61, Y_0, YTMP3                 # YTMP3 = W[-2] >> 61 {DC--}
        vpsllq          $(64-61), Y_0, YTMP1            # YTMP1 = W[-2] << 61 {DC--}
        vpor            YTMP1, YTMP3, YTMP3             # YTMP3 = W[-2] ror 61 {DC--}
        vpxor           YTMP3, YTMP4, YTMP4             # YTMP4 = s1 = (W[-2] ror 19) ^
                                                        #  (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}

        # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
        # to newly calculated sigma1 to get w[18] and w[19]
        vpaddq          YTMP4, YTMP0, YTMP2             # YTMP2 = {W[3], W[2], --, --}

        # Form W[19], W[18], W[17], W[16]
        vpblendd        $0xF0, YTMP2, Y_0, Y_0          # Y_0 = {W[3], W[2], W[1], W[0]}

        mov     a, y3                   # y3 = a                                # MAJA
        rorx    $41, e, y0              # y0 = e >> 41                          # S1A
        rorx    $18, e, y1              # y1 = e >> 18                          # S1B
        add     3*8+frame_XFER(%rsp), h # h = k + w + h                         # --
        or      c, y3                   # y3 = a|c                              # MAJA


        mov     f, y2                   # y2 = f                                # CH
        rorx    $34, a, T1              # T1 = a >> 34                          # S0B
        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18)                # S1
        xor     g, y2                   # y2 = f^g                              # CH


        rorx    $14, e, y1              # y1 = (e >> 14)                        # S1
        and     e, y2                   # y2 = (f^g)&e                          # CH
        add     h, d                    # d = k + w + h + d                     # --
        and     b, y3                   # y3 = (a|c)&b                          # MAJA

        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18) ^ (e>>14)      # S1
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g                 # CH

        rorx    $39, a, y1              # y1 = a >> 39                          # S0A
        add     y0, y2                  # y2 = S1 + CH                          # --

        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34)                # S0
        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1  # --

        rorx    $28, a, T1              # T1 = (a >> 28)                        # S0

        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34) ^ (a>>28)      # S0
        mov     a, T1                   # T1 = a                                # MAJB
        and     c, T1                   # T1 = a&c                              # MAJB
        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

        add     y1, h                   # h = k + w + h + S0                    # --
        add     y2, h                   # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        add     y3, h                   # h = t1 + S0 + MAJ                     # --

        RotateState

        rotate_Ys
.endm

.macro DO_4ROUNDS

################################### RND N + 0 #########################################

        mov     f, y2                   # y2 = f                                # CH
        rorx    $41, e, y0              # y0 = e >> 41                          # S1A
        rorx    $18, e, y1              # y1 = e >> 18                          # S1B
        xor     g, y2                   # y2 = f^g                              # CH

        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18)                # S1
        rorx    $14, e, y1              # y1 = (e >> 14)                        # S1
        and     e, y2                   # y2 = (f^g)&e                          # CH

        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18) ^ (e>>14)      # S1
        rorx    $34, a, T1              # T1 = a >> 34                          # S0B
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $39, a, y1              # y1 = a >> 39                          # S0A
        mov     a, y3                   # y3 = a                                # MAJA

        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34)                # S0
        rorx    $28, a, T1              # T1 = (a >> 28)                        # S0
        add     frame_XFER(%rsp), h     # h = k + w + h                         # --
        or      c, y3                   # y3 = a|c                              # MAJA

        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34) ^ (a>>28)      # S0
        mov     a, T1                   # T1 = a                                # MAJB
        and     b, y3                   # y3 = (a|c)&b                          # MAJA
        and     c, T1                   # T1 = a&c                              # MAJB
        add     y0, y2                  # y2 = S1 + CH                          # --

        add     h, d                    # d = k + w + h + d                     # --
        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h                   # h = k + w + h + S0                    # --

        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1  # --

        RotateState

################################### RND N + 1 #########################################

        add     y2, old_h               # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        mov     f, y2                   # y2 = f                                # CH
        rorx    $41, e, y0              # y0 = e >> 41                          # S1A
        rorx    $18, e, y1              # y1 = e >> 18                          # S1B
        xor     g, y2                   # y2 = f^g                              # CH

        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18)                # S1
        rorx    $14, e, y1              # y1 = (e >> 14)                        # S1
        and     e, y2                   # y2 = (f^g)&e                          # CH
        add     y3, old_h               # h = t1 + S0 + MAJ                     # --

        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18) ^ (e>>14)      # S1
        rorx    $34, a, T1              # T1 = a >> 34                          # S0B
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $39, a, y1              # y1 = a >> 39                          # S0A
        mov     a, y3                   # y3 = a                                # MAJA

        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34)                # S0
        rorx    $28, a, T1              # T1 = (a >> 28)                        # S0
        add     8*1+frame_XFER(%rsp), h # h = k + w + h                         # --
        or      c, y3                   # y3 = a|c                              # MAJA

        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34) ^ (a>>28)      # S0
        mov     a, T1                   # T1 = a                                # MAJB
        and     b, y3                   # y3 = (a|c)&b                          # MAJA
        and     c, T1                   # T1 = a&c                              # MAJB
        add     y0, y2                  # y2 = S1 + CH                          # --

        add     h, d                    # d = k + w + h + d                     # --
        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h                   # h = k + w + h + S0                    # --

        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1  # --

        RotateState

################################### RND N + 2 #########################################

        add     y2, old_h               # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        mov     f, y2                   # y2 = f                                # CH
        rorx    $41, e, y0              # y0 = e >> 41                          # S1A
        rorx    $18, e, y1              # y1 = e >> 18                          # S1B
        xor     g, y2                   # y2 = f^g                              # CH

        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18)                # S1
        rorx    $14, e, y1              # y1 = (e >> 14)                        # S1
        and     e, y2                   # y2 = (f^g)&e                          # CH
        add     y3, old_h               # h = t1 + S0 + MAJ                     # --

        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18) ^ (e>>14)      # S1
        rorx    $34, a, T1              # T1 = a >> 34                          # S0B
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $39, a, y1              # y1 = a >> 39                          # S0A
        mov     a, y3                   # y3 = a                                # MAJA

        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34)                # S0
        rorx    $28, a, T1              # T1 = (a >> 28)                        # S0
        add     8*2+frame_XFER(%rsp), h # h = k + w + h                         # --
        or      c, y3                   # y3 = a|c                              # MAJA

        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34) ^ (a>>28)      # S0
        mov     a, T1                   # T1 = a                                # MAJB
        and     b, y3                   # y3 = (a|c)&b                          # MAJA
        and     c, T1                   # T1 = a&c                              # MAJB
        add     y0, y2                  # y2 = S1 + CH                          # --

        add     h, d                    # d = k + w + h + d                     # --
        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h                   # h = k + w + h + S0                    # --

        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1  # --

        RotateState

################################### RND N + 3 #########################################

        add     y2, old_h               # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
        mov     f, y2                   # y2 = f                                # CH
        rorx    $41, e, y0              # y0 = e >> 41                          # S1A
        rorx    $18, e, y1              # y1 = e >> 18                          # S1B
        xor     g, y2                   # y2 = f^g                              # CH

        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18)                # S1
        rorx    $14, e, y1              # y1 = (e >> 14)                        # S1
        and     e, y2                   # y2 = (f^g)&e                          # CH
        add     y3, old_h               # h = t1 + S0 + MAJ                     # --

        xor     y1, y0                  # y0 = (e>>41) ^ (e>>18) ^ (e>>14)      # S1
        rorx    $34, a, T1              # T1 = a >> 34                          # S0B
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $39, a, y1              # y1 = a >> 39                          # S0A
        mov     a, y3                   # y3 = a                                # MAJA

        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34)                # S0
        rorx    $28, a, T1              # T1 = (a >> 28)                        # S0
        add     8*3+frame_XFER(%rsp), h # h = k + w + h                         # --
        or      c, y3                   # y3 = a|c                              # MAJA

        xor     T1, y1                  # y1 = (a>>39) ^ (a>>34) ^ (a>>28)      # S0
        mov     a, T1                   # T1 = a                                # MAJB
        and     b, y3                   # y3 = (a|c)&b                          # MAJA
        and     c, T1                   # T1 = a&c                              # MAJB
        add     y0, y2                  # y2 = S1 + CH                          # --


        add     h, d                    # d = k + w + h + d                     # --
        or      T1, y3                  # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h                   # h = k + w + h + S0                    # --

        add     y2, d                   # d = k + w + h + d + S1 + CH = d + t1  # --

        add     y2, h                   # h = k + w + h + S0 + S1 + CH = t1 + S0 # --

        add     y3, h                   # h = t1 + S0 + MAJ                     # --

        RotateState

.endm

########################################################################
# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
# Purpose: Updates the SHA512 digest stored at "state" with the message
# stored in "data".
# The size of the message pointed to by "data" must be an integer multiple
# of the SHA512 block size (128 bytes).
# "blocks" is the message length in SHA512 blocks
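#
# Illustrative (hypothetical) C-level caller sketch, assuming only that the
# eight 64-bit digest words sit at the start of "state"; the actual kernel
# glue code may differ:
#
#       /* "len" bytes of input; only whole 128-byte blocks are consumed */
#       int blocks = len / 128;         /* SHA512 block size is 128 bytes */
#       if (blocks)
#               sha512_transform_rorx(state, data, blocks);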
########################################################################
SYM_FUNC_START(sha512_transform_rorx)
        # Allocate Stack Space
        mov     %rsp, %rax
        sub     $frame_size, %rsp
        and     $~(0x20 - 1), %rsp
        mov     %rax, frame_RSPSAVE(%rsp)

        # Save GPRs
        mov     %rbx, 8*0+frame_GPRSAVE(%rsp)
        mov     %r12, 8*1+frame_GPRSAVE(%rsp)
        mov     %r13, 8*2+frame_GPRSAVE(%rsp)
        mov     %r14, 8*3+frame_GPRSAVE(%rsp)
        mov     %r15, 8*4+frame_GPRSAVE(%rsp)

        shl     $7, NUM_BLKS            # convert to bytes
        jz      done_hash
        add     INP, NUM_BLKS           # pointer to end of data
        mov     NUM_BLKS, frame_INPEND(%rsp)

        ## load initial digest
        mov     8*0(CTX1), a
        mov     8*1(CTX1), b
        mov     8*2(CTX1), c
        mov     8*3(CTX1), d
        mov     8*4(CTX1), e
        mov     8*5(CTX1), f
        mov     8*6(CTX1), g
        mov     8*7(CTX1), h

        # save %rdi (CTX) before it gets clobbered
        mov     %rdi, frame_CTX(%rsp)

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK

loop0:
        lea     K512(%rip), TBL

        ## byte swap first 16 qwords
        COPY_YMM_AND_BSWAP      Y_0, (INP), BYTE_FLIP_MASK
        COPY_YMM_AND_BSWAP      Y_1, 1*32(INP), BYTE_FLIP_MASK
        COPY_YMM_AND_BSWAP      Y_2, 2*32(INP), BYTE_FLIP_MASK
        COPY_YMM_AND_BSWAP      Y_3, 3*32(INP), BYTE_FLIP_MASK

        mov     INP, frame_INP(%rsp)

        ## schedule 64 input qwords, by doing 16 rounds of 4 each
        movq    $4, frame_SRND(%rsp)

.align 16
loop1:
        vpaddq  (TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        vpaddq  1*32(TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        vpaddq  2*32(TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        vpaddq  3*32(TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        add     $(4*32), TBL
        FOUR_ROUNDS_AND_SCHED

        subq    $1, frame_SRND(%rsp)
        jne     loop1

        movq    $2, frame_SRND(%rsp)
loop2:
        vpaddq  (TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        DO_4ROUNDS
        vpaddq  1*32(TBL), Y_1, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        add     $(2*32), TBL
        DO_4ROUNDS

        vmovdqa Y_2, Y_0
        vmovdqa Y_3, Y_1

        subq    $1, frame_SRND(%rsp)
        jne     loop2

        mov     frame_CTX(%rsp), CTX2
        addm    8*0(CTX2), a
        addm    8*1(CTX2), b
        addm    8*2(CTX2), c
        addm    8*3(CTX2), d
        addm    8*4(CTX2), e
        addm    8*5(CTX2), f
        addm    8*6(CTX2), g
        addm    8*7(CTX2), h

        mov     frame_INP(%rsp), INP
        add     $128, INP
        cmp     frame_INPEND(%rsp), INP
        jne     loop0

done_hash:

        # Restore GPRs
        mov     8*0+frame_GPRSAVE(%rsp), %rbx
        mov     8*1+frame_GPRSAVE(%rsp), %r12
        mov     8*2+frame_GPRSAVE(%rsp), %r13
        mov     8*3+frame_GPRSAVE(%rsp), %r14
        mov     8*4+frame_GPRSAVE(%rsp), %r15

        # Restore Stack Pointer
        mov     frame_RSPSAVE(%rsp), %rsp
        ret
SYM_FUNC_END(sha512_transform_rorx)

########################################################################
### Binary Data


# Mergeable 640-byte rodata section. This allows the linker to merge the
# table with an identical 640-byte fragment of another rodata section
# (if such a section exists).
.section .rodata.cst640.K512, "aM", @progbits, 640
.align 64
# K[t] used in SHA512 hashing
K512:
        .quad   0x428a2f98d728ae22,0x7137449123ef65cd
        .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
        .quad   0x3956c25bf348b538,0x59f111f1b605d019
        .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
        .quad   0xd807aa98a3030242,0x12835b0145706fbe
        .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
        .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
        .quad   0x9bdc06a725c71235,0xc19bf174cf692694
        .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
        .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
        .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
        .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
        .quad   0x983e5152ee66dfab,0xa831c66d2db43210
        .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
        .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
        .quad   0x06ca6351e003826f,0x142929670a0e6e70
        .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
        .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
        .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
        .quad   0x81c2c92e47edaee6,0x92722c851482353b
        .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
        .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
        .quad   0xd192e819d6ef5218,0xd69906245565a910
        .quad   0xf40e35855771202a,0x106aa07032bbd1b8
        .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
        .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
        .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
        .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
        .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
        .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
        .quad   0x90befffa23631e28,0xa4506cebde82bde9
        .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
        .quad   0xca273eceea26619c,0xd186b8c721c0c207
        .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
        .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
        .quad   0x113f9804bef90dae,0x1b710b35131c471b
        .quad   0x28db77f523047d84,0x32caab7b40c72493
        .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
        .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
        .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817

.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
# Mask for byte-swapping the qwords in a YMM register using vpshufb.
PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x08090a0b0c0d0e0f0001020304050607
        .octa 0x18191a1b1c1d1e1f1011121314151617

.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
.align 32
MASK_YMM_LO:
        .octa 0x00000000000000000000000000000000
        .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF