1######################################################################## 2# Implement fast SHA-512 with AVX2 instructions. (x86_64) 3# 4# Copyright (C) 2013 Intel Corporation. 5# 6# Authors: 7# James Guilford <james.guilford@intel.com> 8# Kirk Yap <kirk.s.yap@intel.com> 9# David Cote <david.m.cote@intel.com> 10# Tim Chen <tim.c.chen@linux.intel.com> 11# 12# This software is available to you under a choice of one of two 13# licenses. You may choose to be licensed under the terms of the GNU 14# General Public License (GPL) Version 2, available from the file 15# COPYING in the main directory of this source tree, or the 16# OpenIB.org BSD license below: 17# 18# Redistribution and use in source and binary forms, with or 19# without modification, are permitted provided that the following 20# conditions are met: 21# 22# - Redistributions of source code must retain the above 23# copyright notice, this list of conditions and the following 24# disclaimer. 25# 26# - Redistributions in binary form must reproduce the above 27# copyright notice, this list of conditions and the following 28# disclaimer in the documentation and/or other materials 29# provided with the distribution. 30# 31# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 32# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 33# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 34# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 35# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 36# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 37# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 38# SOFTWARE. 39# 40######################################################################## 41# 42# This code is described in an Intel White-Paper: 43# "Fast SHA-512 Implementations on Intel Architecture Processors" 44# 45# To find it, surf to http://www.intel.com/p/en_US/embedded 46# and search for that title. 
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################

#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>

.text

# Virtual Registers
# Y_0..Y_3 hold the 16 message schedule qwords currently in flight, four
# per register. The names are assembler symbols that rotate_Ys re-binds.
Y_0 = %ymm4
Y_1 = %ymm5
Y_2 = %ymm6
Y_3 = %ymm7

# Vector scratch registers. XFER shares %ymm0 with YTMP0.
YTMP0 = %ymm0
YTMP1 = %ymm1
YTMP2 = %ymm2
YTMP3 = %ymm3
YTMP4 = %ymm8
XFER  = YTMP0

BYTE_FLIP_MASK  = %ymm9

# 1st arg
CTX         = %rdi
# 2nd arg
INP         = %rsi
# 3rd arg
NUM_BLKS    = %rdx

# Working variables a..h and scalar temporaries. Note the sharing:
# e uses %rdx (same as NUM_BLKS) and y3 uses %rsi (same as INP), so the
# function below keeps the input pointer and end pointer in the stack frame.
c           = %rcx
d           = %r8
e           = %rdx
y3          = %rsi

TBL   = %rbp

a     = %rax
b     = %rbx

f     = %r9
g     = %r10
h     = %r11
# old_h names the register that held h before the latest RotateState
old_h = %r11

T1    = %r12
y0    = %r13
y1    = %r14
y2    = %r15

# y4 shares %r12 with T1
y4    = %r12

# Local variables (stack frame)
# Sizes in bytes; XFER is one 32-byte vector of four K+W qwords.
XFER_SIZE = 4*8
SRND_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
RSPSAVE_SIZE = 1*8
GPRSAVE_SIZE = 6*8

# Offsets from the (aligned) %rsp; each field follows the previous one.
frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
frame_INP = frame_SRND + SRND_SIZE
frame_INPEND = frame_INP + INP_SIZE
frame_RSPSAVE = frame_INPEND + INPEND_SIZE
frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
frame_size = frame_GPRSAVE + GPRSAVE_SIZE

## assume buffers not aligned
#define	VMOVDQ vmovdqu

# addm [mem], reg
# Add reg to mem using reg-mem add and store
# On exit both reg (p2) and mem (p1) hold the sum.
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm


# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
# Load ymm with mem (unaligned load) and shuffle its bytes with
# byte_flip_mask; with PSHUFFLE_BYTE_FLIP_MASK this byte swaps each qword
.macro COPY_YMM_AND_BSWAP p1 p2 p3
	VMOVDQ \p2, \p1
	vpshufb \p3, \p1, \p1
.endm
# rotate_Ys
# Rotate values of symbols Y0...Y3
# Assemble-time only: re-binds the names, emits no instructions.
.macro rotate_Ys
	Y_ = Y_0
	Y_0 = Y_1
	Y_1 = Y_2
	Y_2 = Y_3
	Y_3 = Y_
.endm

# RotateState
# Assemble-time only: re-binds the names a..h so that each working variable
# moves one position, and records the previous h register as old_h.
.macro RotateState
	# Rotate symbols a..h right
	old_h  = h
	TMP_   = h
	h      = g
	g      = f
	f      = e
	e      = d
	d      = c
	c      = b
	b      = a
	a      = TMP_
.endm

# macro MY_VPALIGNR	YDST, YSRC1, YSRC2, RVAL
# YDST = {YSRC1, YSRC2} >> RVAL*8
# Built from a 128-bit lane permute followed by a per-lane vpalignr.
.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
	vperm2f128	$0x3, \YSRC2, \YSRC1, \YDST	# YDST = {YS1_LO, YS2_HI}
	vpalignr	$\RVAL, \YSRC2, \YDST, \YDST	# YDST = {YDS1, YS2} >> RVAL*8
.endm

# FOUR_ROUNDS_AND_SCHED
# Performs four rounds, reading the four K+W qwords at frame_XFER(%rsp),
# interleaved with the vector code that computes the next four schedule
# words into Y_0. Ends by rotating the Y_0..Y_3 names, so the new words
# become Y_3. Overwrites YTMP0-YTMP4 (XFER included), y0-y3 and T1.
.macro FOUR_ROUNDS_AND_SCHED
################################### RND N + 0 #########################################

	# Extract w[t-7]
	MY_VPALIGNR	YTMP0, Y_3, Y_2, 8		# YTMP0 = W[-7]
	# Calculate w[t-16] + w[t-7]
	vpaddq		Y_0, YTMP0, YTMP0		# YTMP0 = W[-7] + W[-16]
	# Extract w[t-15]
	MY_VPALIGNR	YTMP1, Y_1, Y_0, 8		# YTMP1 = W[-15]

	# Calculate sigma0

	# Calculate w[t-15] ror 1
	vpsrlq		$1, YTMP1, YTMP2
	vpsllq		$(64-1), YTMP1, YTMP3
	vpor		YTMP2, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1
	# Calculate w[t-15] shr 7
	vpsrlq		$7, YTMP1, YTMP4		# YTMP4 = W[-15] >> 7

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	add	frame_XFER(%rsp),h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a >> 34				# S0B

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

################################### RND N + 1 #########################################

	# Calculate w[t-15] ror 8
	vpsrlq		$8, YTMP1, YTMP2
	vpsllq		$(64-8), YTMP1, YTMP1
	vpor		YTMP2, YTMP1, YTMP1		# YTMP1 = W[-15] ror 8
	# XOR the three components
	vpxor		YTMP4, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
	vpxor		YTMP1, YTMP3, YTMP1		# YTMP1 = s0


	# Add three components, w[t-16], w[t-7] and sigma0
	vpaddq		YTMP1, YTMP0, YTMP0		# YTMP0 = W[-16] + W[-7] + s0
	# Move to appropriate lanes for calculating w[16] and w[17]
	vperm2f128	$0x0, YTMP0, YTMP0, Y_0		# Y_0 = W[-16] + W[-7] + s0 {BABA}
	# Move to appropriate lanes for calculating w[18] and w[19]
	vpand		MASK_YMM_LO(%rip), YTMP0, YTMP0	# YTMP0 = W[-16] + W[-7] + s0 {DC00}

	# Calculate w[16] and w[17] in both 128 bit lanes

	# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
	vperm2f128	$0x11, Y_3, Y_3, YTMP2		# YTMP2 = W[-2] {BABA}
	vpsrlq		$6, YTMP2, YTMP4		# YTMP4 = W[-2] >> 6 {BABA}


	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	add	1*8+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0

	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState


################################### RND N + 2 #########################################

	vpsrlq		$19, YTMP2, YTMP3		# YTMP3 = W[-2] >> 19 {BABA}
	vpsllq		$(64-19), YTMP2, YTMP1		# YTMP1 = W[-2] << 19 {BABA}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {BABA}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
	vpsrlq		$61, YTMP2, YTMP3		# YTMP3 = W[-2] >> 61 {BABA}
	vpsllq		$(64-61), YTMP2, YTMP1		# YTMP1 = W[-2] << 61 {BABA}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {BABA}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
							#  (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}

	# Add sigma1 to the other components to get w[16] and w[17]
	vpaddq		YTMP4, Y_0, Y_0			# Y_0 = {W[1], W[0], W[1], W[0]}

	# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
	vpsrlq		$6, Y_0, YTMP4			# YTMP4 = W[-2] >> 6 {DC--}

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	add	2*8+frame_XFER(%rsp), h		# h = k + w + h         # --

	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

################################### RND N + 3 #########################################

	vpsrlq		$19, Y_0, YTMP3			# YTMP3 = W[-2] >> 19 {DC--}
	vpsllq		$(64-19), Y_0, YTMP1		# YTMP1 = W[-2] << 19 {DC--}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {DC--}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
	vpsrlq		$61, Y_0, YTMP3			# YTMP3 = W[-2] >> 61 {DC--}
	vpsllq		$(64-61), Y_0, YTMP1		# YTMP1 = W[-2] << 61 {DC--}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {DC--}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
							#  (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}

	# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
	# to newly calculated sigma1 to get w[18] and w[19]
	vpaddq		YTMP4, YTMP0, YTMP2		# YTMP2 = {W[3], W[2], --, --}

	# Form w[19], w[18], w[17], w[16]
	vpblendd	$0xF0, YTMP2, Y_0, Y_0		# Y_0 = {W[3], W[2], W[1], W[0]}

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	add	3*8+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$28, a, T1	# T1 = (a >> 28)			# S0

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

	rotate_Ys
.endm

# DO_4ROUNDS
# Performs four rounds from the four K+W qwords at frame_XFER(%rsp) with no
# message scheduling. In rounds N+0..N+2 the last two additions into h are
# deferred: they are issued at the start of the following round through
# old_h. Round N+3 completes them itself, so nothing is pending on exit.
# Overwrites y0-y3 and T1.
.macro DO_4ROUNDS

################################### RND N + 0 #########################################

	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	RotateState

################################### RND N + 1 #########################################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	8*1+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	RotateState

################################### RND N + 2 #########################################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	8*2+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	RotateState

################################### RND N + 3 #########################################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$41, e, y0	# y0 = e >> 41				# S1A
	rorx	$18, e, y1	# y1 = e >> 18				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
	rorx	$34, a, T1	# T1 = a >> 34				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$39, a, y1	# y1 = a >> 39				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
	add	8*3+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

.endm

########################################################################
# void sha512_transform_rorx(void* D, const void* M, uint64_t L)#
# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
# The size of the message pointed to by M must be an integer multiple of SHA512
#   message blocks.
#   L is the message length in SHA512 blocks
#
# Register and stack usage, as set up by the code below:
#   CTX (%rdi)       pointer to the eight 64-bit digest words, updated in place
#   INP (%rsi)       message pointer; %rsi doubles as y3 inside the round
#                    macros, so the pointer lives in frame_INP during rounds
#   NUM_BLKS (%rdx)  block count; turned into an end pointer stored in
#                    frame_INPEND because %rdx doubles as working variable e
#   %rbp, %rbx and %r12-%r15 are saved in frame_GPRSAVE and restored on exit.
#   The frame is aligned to 32 bytes so frame_XFER can be written with
#   vmovdqa. The ymm registers are used, so vzeroupper is executed before
#   returning to avoid AVX-SSE transition penalties in the caller.
########################################################################
ENTRY(sha512_transform_rorx)
	# Allocate Stack Space
	mov	%rsp, %rax
	sub	$frame_size, %rsp
	and	$~(0x20 - 1), %rsp		# 32-byte align the frame
	mov	%rax, frame_RSPSAVE(%rsp)	# keep the entry %rsp for the epilogue

	# Save GPRs
	mov	%rbp, frame_GPRSAVE(%rsp)
	mov	%rbx, 8*1+frame_GPRSAVE(%rsp)
	mov	%r12, 8*2+frame_GPRSAVE(%rsp)
	mov	%r13, 8*3+frame_GPRSAVE(%rsp)
	mov	%r14, 8*4+frame_GPRSAVE(%rsp)
	mov	%r15, 8*5+frame_GPRSAVE(%rsp)

	shl	$7, NUM_BLKS	# convert to bytes
	jz	.Ldone_hash	# zero blocks: go straight to the epilogue
	add	INP, NUM_BLKS	# pointer to end of data
	mov	NUM_BLKS, frame_INPEND(%rsp)

	## load initial digest
	mov	8*0(CTX),a
	mov	8*1(CTX),b
	mov	8*2(CTX),c
	mov	8*3(CTX),d
	mov	8*4(CTX),e
	mov	8*5(CTX),f
	mov	8*6(CTX),g
	mov	8*7(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK

	## one pass of this loop per 128-byte message block
.Lloop0:
	lea	K512(%rip), TBL

	## load the 16 message qwords of this block and byte swap each of them
	COPY_YMM_AND_BSWAP	Y_0, (INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_1, 1*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_2, 2*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_3, 3*32(INP), BYTE_FLIP_MASK

	mov	INP, frame_INP(%rsp)	# %rsi is about to be reused as y3

	## rounds 0..63: four passes of 16 rounds, each group of four rounds
	## also scheduling the next four message qwords
	movq	$4, frame_SRND(%rsp)

.align 16
.Lloop1:
	# Y_0 names a different register after each FOUR_ROUNDS_AND_SCHED
	# because the macro ends with rotate_Ys.
	vpaddq	(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	1*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	2*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	3*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	add	$(4*32), TBL
	FOUR_ROUNDS_AND_SCHED

	subq	$1, frame_SRND(%rsp)
	jne	.Lloop1

	## rounds 64..79: two passes of 8 rounds, no further scheduling
	movq	$2, frame_SRND(%rsp)
.Lloop2:
	vpaddq	(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	DO_4ROUNDS
	vpaddq	1*32(TBL), Y_1, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	add	$(2*32), TBL
	DO_4ROUNDS

	# the second pass consumes the remaining two schedule vectors
	vmovdqa	Y_2, Y_0
	vmovdqa	Y_3, Y_1

	subq	$1, frame_SRND(%rsp)
	jne	.Lloop2

	## add the working variables into the digest and keep the sums in
	## registers as the starting state of the next block
	addm	8*0(CTX),a
	addm	8*1(CTX),b
	addm	8*2(CTX),c
	addm	8*3(CTX),d
	addm	8*4(CTX),e
	addm	8*5(CTX),f
	addm	8*6(CTX),g
	addm	8*7(CTX),h

	mov	frame_INP(%rsp), INP
	add	$128, INP
	cmp	frame_INPEND(%rsp), INP
	jne	.Lloop0

.Ldone_hash:

# Restore GPRs
	mov	frame_GPRSAVE(%rsp)     ,%rbp
	mov	8*1+frame_GPRSAVE(%rsp) ,%rbx
	mov	8*2+frame_GPRSAVE(%rsp) ,%r12
	mov	8*3+frame_GPRSAVE(%rsp) ,%r13
	mov	8*4+frame_GPRSAVE(%rsp) ,%r14
	mov	8*5+frame_GPRSAVE(%rsp) ,%r15

	# Restore Stack Pointer
	mov	frame_RSPSAVE(%rsp), %rsp

	# Clear the upper halves of the ymm registers before returning
	vzeroupper
	ret
ENDPROC(sha512_transform_rorx)

########################################################################
### Binary Data


# Mergeable 640-byte rodata section. This allows linker to merge the table
# with other, exactly the same 640-byte fragment of another rodata section
# (if such section exists).
.section	.rodata.cst640.K512, "aM", @progbits, 640
.align 64
# SHA-512 round constants K[t], t = 0..79.
# Each row below is one 32-byte group, matching the 32-byte steps in which
# the transform reads the table through TBL.
K512:
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd,0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019,0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe,0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1,0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3,0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483,0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210,0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725,0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926,0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8,0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001,0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910,0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53,0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb,0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60,0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9,0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207,0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6,0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493,0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a,0x5fcb6fab3ad6faec,0x6c44198c4a475817

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
# vpshufb control that reverses the byte order inside each of the four
# qwords of a YMM register. Written as qwords, low one first.
PSHUFFLE_BYTE_FLIP_MASK:
	.quad	0x0001020304050607, 0x08090a0b0c0d0e0f
	.quad	0x1011121314151617, 0x18191a1b1c1d1e1f

.section	.rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
.align 32
# AND mask with the low 128 bits clear and the high 128 bits set; the
# scheduling macro uses it to keep only the upper lane of a YMM register.
MASK_YMM_LO:
	.quad	0x0000000000000000, 0x0000000000000000
	.quad	0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF

#endif