15663535bSTim Chen######################################################################## 25663535bSTim Chen# Implement fast SHA-512 with AVX2 instructions. (x86_64) 35663535bSTim Chen# 45663535bSTim Chen# Copyright (C) 2013 Intel Corporation. 55663535bSTim Chen# 65663535bSTim Chen# Authors: 75663535bSTim Chen# James Guilford <james.guilford@intel.com> 85663535bSTim Chen# Kirk Yap <kirk.s.yap@intel.com> 95663535bSTim Chen# David Cote <david.m.cote@intel.com> 105663535bSTim Chen# Tim Chen <tim.c.chen@linux.intel.com> 115663535bSTim Chen# 125663535bSTim Chen# This software is available to you under a choice of one of two 135663535bSTim Chen# licenses. You may choose to be licensed under the terms of the GNU 145663535bSTim Chen# General Public License (GPL) Version 2, available from the file 155663535bSTim Chen# COPYING in the main directory of this source tree, or the 165663535bSTim Chen# OpenIB.org BSD license below: 175663535bSTim Chen# 185663535bSTim Chen# Redistribution and use in source and binary forms, with or 195663535bSTim Chen# without modification, are permitted provided that the following 205663535bSTim Chen# conditions are met: 215663535bSTim Chen# 225663535bSTim Chen# - Redistributions of source code must retain the above 235663535bSTim Chen# copyright notice, this list of conditions and the following 245663535bSTim Chen# disclaimer. 255663535bSTim Chen# 265663535bSTim Chen# - Redistributions in binary form must reproduce the above 275663535bSTim Chen# copyright notice, this list of conditions and the following 285663535bSTim Chen# disclaimer in the documentation and/or other materials 295663535bSTim Chen# provided with the distribution. 305663535bSTim Chen# 315663535bSTim Chen# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 325663535bSTim Chen# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 335663535bSTim Chen# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 345663535bSTim Chen# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 355663535bSTim Chen# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 365663535bSTim Chen# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 375663535bSTim Chen# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 385663535bSTim Chen# SOFTWARE. 395663535bSTim Chen# 405663535bSTim Chen######################################################################## 415663535bSTim Chen# 425663535bSTim Chen# This code is described in an Intel White-Paper: 435663535bSTim Chen# "Fast SHA-512 Implementations on Intel Architecture Processors" 445663535bSTim Chen# 455663535bSTim Chen# To find it, surf to http://www.intel.com/p/en_US/embedded 465663535bSTim Chen# and search for that title. 475663535bSTim Chen# 485663535bSTim Chen######################################################################## 495663535bSTim Chen# This code schedules 1 blocks at a time, with 4 lanes per block 505663535bSTim Chen######################################################################## 515663535bSTim Chen 525663535bSTim Chen#ifdef CONFIG_AS_AVX2 535663535bSTim Chen#include <linux/linkage.h> 545663535bSTim Chen 555663535bSTim Chen.text 565663535bSTim Chen 575663535bSTim Chen# Virtual Registers 585663535bSTim ChenY_0 = %ymm4 595663535bSTim ChenY_1 = %ymm5 605663535bSTim ChenY_2 = %ymm6 615663535bSTim ChenY_3 = %ymm7 625663535bSTim Chen 635663535bSTim ChenYTMP0 = %ymm0 645663535bSTim ChenYTMP1 = %ymm1 655663535bSTim ChenYTMP2 = %ymm2 665663535bSTim ChenYTMP3 = %ymm3 675663535bSTim ChenYTMP4 = %ymm8 685663535bSTim ChenXFER = YTMP0 695663535bSTim Chen 705663535bSTim ChenBYTE_FLIP_MASK = %ymm9 715663535bSTim Chen 72ca04c823SJosh Poimboeuf# 1st arg is %rdi, which is saved to the stack and accessed later via %r12 73ca04c823SJosh PoimboeufCTX1 = %rdi 74ca04c823SJosh PoimboeufCTX2 = %r12 755663535bSTim Chen# 2nd arg 76e68410ebSArd BiesheuvelINP = %rsi 775663535bSTim Chen# 3rd arg 785663535bSTim ChenNUM_BLKS = 
%rdx 795663535bSTim Chen 805663535bSTim Chenc = %rcx 815663535bSTim Chend = %r8 825663535bSTim Chene = %rdx 8300425bb1SArd Biesheuvely3 = %rsi 845663535bSTim Chen 85ca04c823SJosh PoimboeufTBL = %rdi # clobbers CTX1 865663535bSTim Chen 875663535bSTim Chena = %rax 885663535bSTim Chenb = %rbx 895663535bSTim Chen 905663535bSTim Chenf = %r9 915663535bSTim Cheng = %r10 925663535bSTim Chenh = %r11 935663535bSTim Chenold_h = %r11 945663535bSTim Chen 95ca04c823SJosh PoimboeufT1 = %r12 # clobbers CTX2 965663535bSTim Cheny0 = %r13 975663535bSTim Cheny1 = %r14 985663535bSTim Cheny2 = %r15 995663535bSTim Chen 1005663535bSTim Chen# Local variables (stack frame) 1015663535bSTim ChenXFER_SIZE = 4*8 1025663535bSTim ChenSRND_SIZE = 1*8 1035663535bSTim ChenINP_SIZE = 1*8 1045663535bSTim ChenINPEND_SIZE = 1*8 105ca04c823SJosh PoimboeufCTX_SIZE = 1*8 1065663535bSTim ChenRSPSAVE_SIZE = 1*8 107ca04c823SJosh PoimboeufGPRSAVE_SIZE = 5*8 1085663535bSTim Chen 1095663535bSTim Chenframe_XFER = 0 1105663535bSTim Chenframe_SRND = frame_XFER + XFER_SIZE 1115663535bSTim Chenframe_INP = frame_SRND + SRND_SIZE 1125663535bSTim Chenframe_INPEND = frame_INP + INP_SIZE 113ca04c823SJosh Poimboeufframe_CTX = frame_INPEND + INPEND_SIZE 114ca04c823SJosh Poimboeufframe_RSPSAVE = frame_CTX + CTX_SIZE 1155663535bSTim Chenframe_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE 1165663535bSTim Chenframe_size = frame_GPRSAVE + GPRSAVE_SIZE 1175663535bSTim Chen 1185663535bSTim Chen## assume buffers not aligned 1195663535bSTim Chen#define VMOVDQ vmovdqu 1205663535bSTim Chen 1215663535bSTim Chen# addm [mem], reg 1225663535bSTim Chen# Add reg to mem using reg-mem add and store 1235663535bSTim Chen.macro addm p1 p2 1245663535bSTim Chen add \p1, \p2 1255663535bSTim Chen mov \p2, \p1 1265663535bSTim Chen.endm 1275663535bSTim Chen 1285663535bSTim Chen 1295663535bSTim Chen# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask 1305663535bSTim Chen# Load ymm with mem and byte swap each dword 1315663535bSTim Chen.macro COPY_YMM_AND_BSWAP p1 
p2 p3 1325663535bSTim Chen VMOVDQ \p2, \p1 1335663535bSTim Chen vpshufb \p3, \p1, \p1 1345663535bSTim Chen.endm 1355663535bSTim Chen# rotate_Ys 1365663535bSTim Chen# Rotate values of symbols Y0...Y3 1375663535bSTim Chen.macro rotate_Ys 1385663535bSTim Chen Y_ = Y_0 1395663535bSTim Chen Y_0 = Y_1 1405663535bSTim Chen Y_1 = Y_2 1415663535bSTim Chen Y_2 = Y_3 1425663535bSTim Chen Y_3 = Y_ 1435663535bSTim Chen.endm 1445663535bSTim Chen 1455663535bSTim Chen# RotateState 1465663535bSTim Chen.macro RotateState 1475663535bSTim Chen # Rotate symbols a..h right 1485663535bSTim Chen old_h = h 1495663535bSTim Chen TMP_ = h 1505663535bSTim Chen h = g 1515663535bSTim Chen g = f 1525663535bSTim Chen f = e 1535663535bSTim Chen e = d 1545663535bSTim Chen d = c 1555663535bSTim Chen c = b 1565663535bSTim Chen b = a 1575663535bSTim Chen a = TMP_ 1585663535bSTim Chen.endm 1595663535bSTim Chen 1605663535bSTim Chen# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL 1615663535bSTim Chen# YDST = {YSRC1, YSRC2} >> RVAL*8 1625663535bSTim Chen.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL 1635663535bSTim Chen vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI} 1645663535bSTim Chen vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 1655663535bSTim Chen.endm 1665663535bSTim Chen 1675663535bSTim Chen.macro FOUR_ROUNDS_AND_SCHED 1685663535bSTim Chen################################### RND N + 0 ######################################### 1695663535bSTim Chen 1705663535bSTim Chen # Extract w[t-7] 1715663535bSTim Chen MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7] 1725663535bSTim Chen # Calculate w[t-16] + w[t-7] 1735663535bSTim Chen vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16] 1745663535bSTim Chen # Extract w[t-15] 1755663535bSTim Chen MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15] 1765663535bSTim Chen 1775663535bSTim Chen # Calculate sigma0 1785663535bSTim Chen 1795663535bSTim Chen # Calculate w[t-15] ror 1 1805663535bSTim Chen vpsrlq $1, YTMP1, YTMP2 1815663535bSTim Chen 
vpsllq $(64-1), YTMP1, YTMP3 1825663535bSTim Chen vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 1835663535bSTim Chen # Calculate w[t-15] shr 7 1845663535bSTim Chen vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7 1855663535bSTim Chen 1865663535bSTim Chen mov a, y3 # y3 = a # MAJA 1875663535bSTim Chen rorx $41, e, y0 # y0 = e >> 41 # S1A 1885663535bSTim Chen rorx $18, e, y1 # y1 = e >> 18 # S1B 1895663535bSTim Chen add frame_XFER(%rsp),h # h = k + w + h # -- 1905663535bSTim Chen or c, y3 # y3 = a|c # MAJA 1915663535bSTim Chen mov f, y2 # y2 = f # CH 1925663535bSTim Chen rorx $34, a, T1 # T1 = a >> 34 # S0B 1935663535bSTim Chen 1945663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 1955663535bSTim Chen xor g, y2 # y2 = f^g # CH 1965663535bSTim Chen rorx $14, e, y1 # y1 = (e >> 14) # S1 1975663535bSTim Chen 1985663535bSTim Chen and e, y2 # y2 = (f^g)&e # CH 1995663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 2005663535bSTim Chen rorx $39, a, y1 # y1 = a >> 39 # S0A 2015663535bSTim Chen add h, d # d = k + w + h + d # -- 2025663535bSTim Chen 2035663535bSTim Chen and b, y3 # y3 = (a|c)&b # MAJA 2045663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 2055663535bSTim Chen rorx $28, a, T1 # T1 = (a >> 28) # S0 2065663535bSTim Chen 2075663535bSTim Chen xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 2085663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 2095663535bSTim Chen mov a, T1 # T1 = a # MAJB 2105663535bSTim Chen and c, T1 # T1 = a&c # MAJB 2115663535bSTim Chen 2125663535bSTim Chen add y0, y2 # y2 = S1 + CH # -- 2135663535bSTim Chen or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 2145663535bSTim Chen add y1, h # h = k + w + h + S0 # -- 2155663535bSTim Chen 2165663535bSTim Chen add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 2175663535bSTim Chen 2185663535bSTim Chen add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 2195663535bSTim Chen add y3, h # h = t1 + S0 + MAJ # -- 2205663535bSTim Chen 2215663535bSTim Chen RotateState 
2225663535bSTim Chen 2235663535bSTim Chen################################### RND N + 1 ######################################### 2245663535bSTim Chen 2255663535bSTim Chen # Calculate w[t-15] ror 8 2265663535bSTim Chen vpsrlq $8, YTMP1, YTMP2 2275663535bSTim Chen vpsllq $(64-8), YTMP1, YTMP1 2285663535bSTim Chen vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8 2295663535bSTim Chen # XOR the three components 2305663535bSTim Chen vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 2315663535bSTim Chen vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0 2325663535bSTim Chen 2335663535bSTim Chen 2345663535bSTim Chen # Add three components, w[t-16], w[t-7] and sigma0 2355663535bSTim Chen vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 2365663535bSTim Chen # Move to appropriate lanes for calculating w[16] and w[17] 2375663535bSTim Chen vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA} 2385663535bSTim Chen # Move to appropriate lanes for calculating w[18] and w[19] 2395663535bSTim Chen vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00} 2405663535bSTim Chen 2415663535bSTim Chen # Calculate w[16] and w[17] in both 128 bit lanes 2425663535bSTim Chen 2435663535bSTim Chen # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes 2445663535bSTim Chen vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA} 2455663535bSTim Chen vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA} 2465663535bSTim Chen 2475663535bSTim Chen 2485663535bSTim Chen mov a, y3 # y3 = a # MAJA 2495663535bSTim Chen rorx $41, e, y0 # y0 = e >> 41 # S1A 2505663535bSTim Chen rorx $18, e, y1 # y1 = e >> 18 # S1B 2515663535bSTim Chen add 1*8+frame_XFER(%rsp), h # h = k + w + h # -- 2525663535bSTim Chen or c, y3 # y3 = a|c # MAJA 2535663535bSTim Chen 2545663535bSTim Chen 2555663535bSTim Chen mov f, y2 # y2 = f # CH 2565663535bSTim Chen rorx $34, a, T1 # T1 = a >> 34 # S0B 2575663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 2585663535bSTim Chen xor g, 
y2 # y2 = f^g # CH 2595663535bSTim Chen 2605663535bSTim Chen 2615663535bSTim Chen rorx $14, e, y1 # y1 = (e >> 14) # S1 2625663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 2635663535bSTim Chen rorx $39, a, y1 # y1 = a >> 39 # S0A 2645663535bSTim Chen and e, y2 # y2 = (f^g)&e # CH 2655663535bSTim Chen add h, d # d = k + w + h + d # -- 2665663535bSTim Chen 2675663535bSTim Chen and b, y3 # y3 = (a|c)&b # MAJA 2685663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 2695663535bSTim Chen 2705663535bSTim Chen rorx $28, a, T1 # T1 = (a >> 28) # S0 2715663535bSTim Chen xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 2725663535bSTim Chen 2735663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 2745663535bSTim Chen mov a, T1 # T1 = a # MAJB 2755663535bSTim Chen and c, T1 # T1 = a&c # MAJB 2765663535bSTim Chen add y0, y2 # y2 = S1 + CH # -- 2775663535bSTim Chen 2785663535bSTim Chen or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 2795663535bSTim Chen add y1, h # h = k + w + h + S0 # -- 2805663535bSTim Chen 2815663535bSTim Chen add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 2825663535bSTim Chen add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 2835663535bSTim Chen add y3, h # h = t1 + S0 + MAJ # -- 2845663535bSTim Chen 2855663535bSTim Chen RotateState 2865663535bSTim Chen 2875663535bSTim Chen 2885663535bSTim Chen################################### RND N + 2 ######################################### 2895663535bSTim Chen 2905663535bSTim Chen vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} 2915663535bSTim Chen vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} 2925663535bSTim Chen vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} 2935663535bSTim Chen vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} 2945663535bSTim Chen vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} 2955663535bSTim Chen vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} 2965663535bSTim Chen vpor YTMP1, YTMP3, YTMP3 # YTMP3 
= W[-2] ror 61 {BABA} 2975663535bSTim Chen vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ 2985663535bSTim Chen # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} 2995663535bSTim Chen 3005663535bSTim Chen # Add sigma1 to the other compunents to get w[16] and w[17] 3015663535bSTim Chen vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]} 3025663535bSTim Chen 3035663535bSTim Chen # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane 3045663535bSTim Chen vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} 3055663535bSTim Chen 3065663535bSTim Chen mov a, y3 # y3 = a # MAJA 3075663535bSTim Chen rorx $41, e, y0 # y0 = e >> 41 # S1A 3085663535bSTim Chen add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- 3095663535bSTim Chen 3105663535bSTim Chen rorx $18, e, y1 # y1 = e >> 18 # S1B 3115663535bSTim Chen or c, y3 # y3 = a|c # MAJA 3125663535bSTim Chen mov f, y2 # y2 = f # CH 3135663535bSTim Chen xor g, y2 # y2 = f^g # CH 3145663535bSTim Chen 3155663535bSTim Chen rorx $34, a, T1 # T1 = a >> 34 # S0B 3165663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 3175663535bSTim Chen and e, y2 # y2 = (f^g)&e # CH 3185663535bSTim Chen 3195663535bSTim Chen rorx $14, e, y1 # y1 = (e >> 14) # S1 3205663535bSTim Chen add h, d # d = k + w + h + d # -- 3215663535bSTim Chen and b, y3 # y3 = (a|c)&b # MAJA 3225663535bSTim Chen 3235663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 3245663535bSTim Chen rorx $39, a, y1 # y1 = a >> 39 # S0A 3255663535bSTim Chen xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 3265663535bSTim Chen 3275663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 3285663535bSTim Chen rorx $28, a, T1 # T1 = (a >> 28) # S0 3295663535bSTim Chen 3305663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 3315663535bSTim Chen mov a, T1 # T1 = a # MAJB 3325663535bSTim Chen and c, T1 # T1 = a&c # MAJB 3335663535bSTim Chen add y0, y2 # y2 = S1 + CH # -- 3345663535bSTim Chen 3355663535bSTim Chen or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 
3365663535bSTim Chen add y1, h # h = k + w + h + S0 # -- 3375663535bSTim Chen add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 3385663535bSTim Chen add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 3395663535bSTim Chen 3405663535bSTim Chen add y3, h # h = t1 + S0 + MAJ # -- 3415663535bSTim Chen 3425663535bSTim Chen RotateState 3435663535bSTim Chen 3445663535bSTim Chen################################### RND N + 3 ######################################### 3455663535bSTim Chen 3465663535bSTim Chen vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--} 3475663535bSTim Chen vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--} 3485663535bSTim Chen vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--} 3495663535bSTim Chen vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} 3505663535bSTim Chen vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--} 3515663535bSTim Chen vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--} 3525663535bSTim Chen vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--} 3535663535bSTim Chen vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ 3545663535bSTim Chen # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} 3555663535bSTim Chen 3565663535bSTim Chen # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] 3575663535bSTim Chen # to newly calculated sigma1 to get w[18] and w[19] 3585663535bSTim Chen vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --} 3595663535bSTim Chen 3605663535bSTim Chen # Form w[19, w[18], w17], w[16] 3615663535bSTim Chen vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]} 3625663535bSTim Chen 3635663535bSTim Chen mov a, y3 # y3 = a # MAJA 3645663535bSTim Chen rorx $41, e, y0 # y0 = e >> 41 # S1A 3655663535bSTim Chen rorx $18, e, y1 # y1 = e >> 18 # S1B 3665663535bSTim Chen add 3*8+frame_XFER(%rsp), h # h = k + w + h # -- 3675663535bSTim Chen or c, y3 # y3 = a|c # MAJA 3685663535bSTim Chen 3695663535bSTim Chen 3705663535bSTim Chen mov f, y2 # y2 = f # CH 3715663535bSTim Chen rorx 
$34, a, T1 # T1 = a >> 34 # S0B 3725663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 3735663535bSTim Chen xor g, y2 # y2 = f^g # CH 3745663535bSTim Chen 3755663535bSTim Chen 3765663535bSTim Chen rorx $14, e, y1 # y1 = (e >> 14) # S1 3775663535bSTim Chen and e, y2 # y2 = (f^g)&e # CH 3785663535bSTim Chen add h, d # d = k + w + h + d # -- 3795663535bSTim Chen and b, y3 # y3 = (a|c)&b # MAJA 3805663535bSTim Chen 3815663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 3825663535bSTim Chen xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 3835663535bSTim Chen 3845663535bSTim Chen rorx $39, a, y1 # y1 = a >> 39 # S0A 3855663535bSTim Chen add y0, y2 # y2 = S1 + CH # -- 3865663535bSTim Chen 3875663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 3885663535bSTim Chen add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 3895663535bSTim Chen 3905663535bSTim Chen rorx $28, a, T1 # T1 = (a >> 28) # S0 3915663535bSTim Chen 3925663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 3935663535bSTim Chen mov a, T1 # T1 = a # MAJB 3945663535bSTim Chen and c, T1 # T1 = a&c # MAJB 3955663535bSTim Chen or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 3965663535bSTim Chen 3975663535bSTim Chen add y1, h # h = k + w + h + S0 # -- 3985663535bSTim Chen add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 3995663535bSTim Chen add y3, h # h = t1 + S0 + MAJ # -- 4005663535bSTim Chen 4015663535bSTim Chen RotateState 4025663535bSTim Chen 4035663535bSTim Chen rotate_Ys 4045663535bSTim Chen.endm 4055663535bSTim Chen 4065663535bSTim Chen.macro DO_4ROUNDS 4075663535bSTim Chen 4085663535bSTim Chen################################### RND N + 0 ######################################### 4095663535bSTim Chen 4105663535bSTim Chen mov f, y2 # y2 = f # CH 4115663535bSTim Chen rorx $41, e, y0 # y0 = e >> 41 # S1A 4125663535bSTim Chen rorx $18, e, y1 # y1 = e >> 18 # S1B 4135663535bSTim Chen xor g, y2 # y2 = f^g # CH 4145663535bSTim Chen 4155663535bSTim Chen xor y1, y0 # y0 = 
(e>>41) ^ (e>>18) # S1 4165663535bSTim Chen rorx $14, e, y1 # y1 = (e >> 14) # S1 4175663535bSTim Chen and e, y2 # y2 = (f^g)&e # CH 4185663535bSTim Chen 4195663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 4205663535bSTim Chen rorx $34, a, T1 # T1 = a >> 34 # S0B 4215663535bSTim Chen xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 4225663535bSTim Chen rorx $39, a, y1 # y1 = a >> 39 # S0A 4235663535bSTim Chen mov a, y3 # y3 = a # MAJA 4245663535bSTim Chen 4255663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 4265663535bSTim Chen rorx $28, a, T1 # T1 = (a >> 28) # S0 4275663535bSTim Chen add frame_XFER(%rsp), h # h = k + w + h # -- 4285663535bSTim Chen or c, y3 # y3 = a|c # MAJA 4295663535bSTim Chen 4305663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 4315663535bSTim Chen mov a, T1 # T1 = a # MAJB 4325663535bSTim Chen and b, y3 # y3 = (a|c)&b # MAJA 4335663535bSTim Chen and c, T1 # T1 = a&c # MAJB 4345663535bSTim Chen add y0, y2 # y2 = S1 + CH # -- 4355663535bSTim Chen 4365663535bSTim Chen add h, d # d = k + w + h + d # -- 4375663535bSTim Chen or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 4385663535bSTim Chen add y1, h # h = k + w + h + S0 # -- 4395663535bSTim Chen 4405663535bSTim Chen add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 4415663535bSTim Chen 4425663535bSTim Chen RotateState 4435663535bSTim Chen 4445663535bSTim Chen################################### RND N + 1 ######################################### 4455663535bSTim Chen 4465663535bSTim Chen add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 4475663535bSTim Chen mov f, y2 # y2 = f # CH 4485663535bSTim Chen rorx $41, e, y0 # y0 = e >> 41 # S1A 4495663535bSTim Chen rorx $18, e, y1 # y1 = e >> 18 # S1B 4505663535bSTim Chen xor g, y2 # y2 = f^g # CH 4515663535bSTim Chen 4525663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 4535663535bSTim Chen rorx $14, e, y1 # y1 = (e >> 14) # S1 4545663535bSTim Chen and e, y2 # y2 = (f^g)&e # CH 4555663535bSTim Chen 
add y3, old_h # h = t1 + S0 + MAJ # -- 4565663535bSTim Chen 4575663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 4585663535bSTim Chen rorx $34, a, T1 # T1 = a >> 34 # S0B 4595663535bSTim Chen xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 4605663535bSTim Chen rorx $39, a, y1 # y1 = a >> 39 # S0A 4615663535bSTim Chen mov a, y3 # y3 = a # MAJA 4625663535bSTim Chen 4635663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 4645663535bSTim Chen rorx $28, a, T1 # T1 = (a >> 28) # S0 4655663535bSTim Chen add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- 4665663535bSTim Chen or c, y3 # y3 = a|c # MAJA 4675663535bSTim Chen 4685663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 4695663535bSTim Chen mov a, T1 # T1 = a # MAJB 4705663535bSTim Chen and b, y3 # y3 = (a|c)&b # MAJA 4715663535bSTim Chen and c, T1 # T1 = a&c # MAJB 4725663535bSTim Chen add y0, y2 # y2 = S1 + CH # -- 4735663535bSTim Chen 4745663535bSTim Chen add h, d # d = k + w + h + d # -- 4755663535bSTim Chen or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 4765663535bSTim Chen add y1, h # h = k + w + h + S0 # -- 4775663535bSTim Chen 4785663535bSTim Chen add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 4795663535bSTim Chen 4805663535bSTim Chen RotateState 4815663535bSTim Chen 4825663535bSTim Chen################################### RND N + 2 ######################################### 4835663535bSTim Chen 4845663535bSTim Chen add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 4855663535bSTim Chen mov f, y2 # y2 = f # CH 4865663535bSTim Chen rorx $41, e, y0 # y0 = e >> 41 # S1A 4875663535bSTim Chen rorx $18, e, y1 # y1 = e >> 18 # S1B 4885663535bSTim Chen xor g, y2 # y2 = f^g # CH 4895663535bSTim Chen 4905663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 4915663535bSTim Chen rorx $14, e, y1 # y1 = (e >> 14) # S1 4925663535bSTim Chen and e, y2 # y2 = (f^g)&e # CH 4935663535bSTim Chen add y3, old_h # h = t1 + S0 + MAJ # -- 4945663535bSTim Chen 4955663535bSTim Chen xor y1, 
y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 4965663535bSTim Chen rorx $34, a, T1 # T1 = a >> 34 # S0B 4975663535bSTim Chen xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 4985663535bSTim Chen rorx $39, a, y1 # y1 = a >> 39 # S0A 4995663535bSTim Chen mov a, y3 # y3 = a # MAJA 5005663535bSTim Chen 5015663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 5025663535bSTim Chen rorx $28, a, T1 # T1 = (a >> 28) # S0 5035663535bSTim Chen add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- 5045663535bSTim Chen or c, y3 # y3 = a|c # MAJA 5055663535bSTim Chen 5065663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 5075663535bSTim Chen mov a, T1 # T1 = a # MAJB 5085663535bSTim Chen and b, y3 # y3 = (a|c)&b # MAJA 5095663535bSTim Chen and c, T1 # T1 = a&c # MAJB 5105663535bSTim Chen add y0, y2 # y2 = S1 + CH # -- 5115663535bSTim Chen 5125663535bSTim Chen add h, d # d = k + w + h + d # -- 5135663535bSTim Chen or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 5145663535bSTim Chen add y1, h # h = k + w + h + S0 # -- 5155663535bSTim Chen 5165663535bSTim Chen add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 5175663535bSTim Chen 5185663535bSTim Chen RotateState 5195663535bSTim Chen 5205663535bSTim Chen################################### RND N + 3 ######################################### 5215663535bSTim Chen 5225663535bSTim Chen add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 5235663535bSTim Chen mov f, y2 # y2 = f # CH 5245663535bSTim Chen rorx $41, e, y0 # y0 = e >> 41 # S1A 5255663535bSTim Chen rorx $18, e, y1 # y1 = e >> 18 # S1B 5265663535bSTim Chen xor g, y2 # y2 = f^g # CH 5275663535bSTim Chen 5285663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 5295663535bSTim Chen rorx $14, e, y1 # y1 = (e >> 14) # S1 5305663535bSTim Chen and e, y2 # y2 = (f^g)&e # CH 5315663535bSTim Chen add y3, old_h # h = t1 + S0 + MAJ # -- 5325663535bSTim Chen 5335663535bSTim Chen xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 5345663535bSTim Chen rorx $34, a, T1 # T1 = a 
>> 34 # S0B 5355663535bSTim Chen xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 5365663535bSTim Chen rorx $39, a, y1 # y1 = a >> 39 # S0A 5375663535bSTim Chen mov a, y3 # y3 = a # MAJA 5385663535bSTim Chen 5395663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 5405663535bSTim Chen rorx $28, a, T1 # T1 = (a >> 28) # S0 5415663535bSTim Chen add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- 5425663535bSTim Chen or c, y3 # y3 = a|c # MAJA 5435663535bSTim Chen 5445663535bSTim Chen xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 5455663535bSTim Chen mov a, T1 # T1 = a # MAJB 5465663535bSTim Chen and b, y3 # y3 = (a|c)&b # MAJA 5475663535bSTim Chen and c, T1 # T1 = a&c # MAJB 5485663535bSTim Chen add y0, y2 # y2 = S1 + CH # -- 5495663535bSTim Chen 5505663535bSTim Chen 5515663535bSTim Chen add h, d # d = k + w + h + d # -- 5525663535bSTim Chen or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 5535663535bSTim Chen add y1, h # h = k + w + h + S0 # -- 5545663535bSTim Chen 5555663535bSTim Chen add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 5565663535bSTim Chen 5575663535bSTim Chen add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 5585663535bSTim Chen 5595663535bSTim Chen add y3, h # h = t1 + S0 + MAJ # -- 5605663535bSTim Chen 5615663535bSTim Chen RotateState 5625663535bSTim Chen 5635663535bSTim Chen.endm 5645663535bSTim Chen 5655663535bSTim Chen######################################################################## 566e68410ebSArd Biesheuvel# void sha512_transform_rorx(void* D, const void* M, uint64_t L)# 5675663535bSTim Chen# Purpose: Updates the SHA512 digest stored at D with the message stored in M. 5685663535bSTim Chen# The size of the message pointed to by M must be an integer multiple of SHA512 5695663535bSTim Chen# message blocks. 
5705663535bSTim Chen# L is the message length in SHA512 blocks 5715663535bSTim Chen######################################################################## 572*6dcc5627SJiri SlabySYM_FUNC_START(sha512_transform_rorx) 5735663535bSTim Chen # Allocate Stack Space 5745663535bSTim Chen mov %rsp, %rax 5755663535bSTim Chen sub $frame_size, %rsp 5765663535bSTim Chen and $~(0x20 - 1), %rsp 5775663535bSTim Chen mov %rax, frame_RSPSAVE(%rsp) 5785663535bSTim Chen 5795663535bSTim Chen # Save GPRs 580ca04c823SJosh Poimboeuf mov %rbx, 8*0+frame_GPRSAVE(%rsp) 581ca04c823SJosh Poimboeuf mov %r12, 8*1+frame_GPRSAVE(%rsp) 582ca04c823SJosh Poimboeuf mov %r13, 8*2+frame_GPRSAVE(%rsp) 583ca04c823SJosh Poimboeuf mov %r14, 8*3+frame_GPRSAVE(%rsp) 584ca04c823SJosh Poimboeuf mov %r15, 8*4+frame_GPRSAVE(%rsp) 5855663535bSTim Chen 5865663535bSTim Chen shl $7, NUM_BLKS # convert to bytes 5875663535bSTim Chen jz done_hash 5885663535bSTim Chen add INP, NUM_BLKS # pointer to end of data 5895663535bSTim Chen mov NUM_BLKS, frame_INPEND(%rsp) 5905663535bSTim Chen 5915663535bSTim Chen ## load initial digest 592ca04c823SJosh Poimboeuf mov 8*0(CTX1), a 593ca04c823SJosh Poimboeuf mov 8*1(CTX1), b 594ca04c823SJosh Poimboeuf mov 8*2(CTX1), c 595ca04c823SJosh Poimboeuf mov 8*3(CTX1), d 596ca04c823SJosh Poimboeuf mov 8*4(CTX1), e 597ca04c823SJosh Poimboeuf mov 8*5(CTX1), f 598ca04c823SJosh Poimboeuf mov 8*6(CTX1), g 599ca04c823SJosh Poimboeuf mov 8*7(CTX1), h 600ca04c823SJosh Poimboeuf 601ca04c823SJosh Poimboeuf # save %rdi (CTX) before it gets clobbered 602ca04c823SJosh Poimboeuf mov %rdi, frame_CTX(%rsp) 6035663535bSTim Chen 6045663535bSTim Chen vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK 6055663535bSTim Chen 6065663535bSTim Chenloop0: 6075663535bSTim Chen lea K512(%rip), TBL 6085663535bSTim Chen 6095663535bSTim Chen ## byte swap first 16 dwords 6105663535bSTim Chen COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK 6115663535bSTim Chen COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK 
6125663535bSTim Chen COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK 6135663535bSTim Chen COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK 6145663535bSTim Chen 6155663535bSTim Chen mov INP, frame_INP(%rsp) 6165663535bSTim Chen 6175663535bSTim Chen ## schedule 64 input dwords, by doing 12 rounds of 4 each 6185663535bSTim Chen movq $4, frame_SRND(%rsp) 6195663535bSTim Chen 6205663535bSTim Chen.align 16 6215663535bSTim Chenloop1: 6225663535bSTim Chen vpaddq (TBL), Y_0, XFER 6235663535bSTim Chen vmovdqa XFER, frame_XFER(%rsp) 6245663535bSTim Chen FOUR_ROUNDS_AND_SCHED 6255663535bSTim Chen 6265663535bSTim Chen vpaddq 1*32(TBL), Y_0, XFER 6275663535bSTim Chen vmovdqa XFER, frame_XFER(%rsp) 6285663535bSTim Chen FOUR_ROUNDS_AND_SCHED 6295663535bSTim Chen 6305663535bSTim Chen vpaddq 2*32(TBL), Y_0, XFER 6315663535bSTim Chen vmovdqa XFER, frame_XFER(%rsp) 6325663535bSTim Chen FOUR_ROUNDS_AND_SCHED 6335663535bSTim Chen 6345663535bSTim Chen vpaddq 3*32(TBL), Y_0, XFER 6355663535bSTim Chen vmovdqa XFER, frame_XFER(%rsp) 6365663535bSTim Chen add $(4*32), TBL 6375663535bSTim Chen FOUR_ROUNDS_AND_SCHED 6385663535bSTim Chen 6395663535bSTim Chen subq $1, frame_SRND(%rsp) 6405663535bSTim Chen jne loop1 6415663535bSTim Chen 6425663535bSTim Chen movq $2, frame_SRND(%rsp) 6435663535bSTim Chenloop2: 6445663535bSTim Chen vpaddq (TBL), Y_0, XFER 6455663535bSTim Chen vmovdqa XFER, frame_XFER(%rsp) 6465663535bSTim Chen DO_4ROUNDS 6475663535bSTim Chen vpaddq 1*32(TBL), Y_1, XFER 6485663535bSTim Chen vmovdqa XFER, frame_XFER(%rsp) 6495663535bSTim Chen add $(2*32), TBL 6505663535bSTim Chen DO_4ROUNDS 6515663535bSTim Chen 6525663535bSTim Chen vmovdqa Y_2, Y_0 6535663535bSTim Chen vmovdqa Y_3, Y_1 6545663535bSTim Chen 6555663535bSTim Chen subq $1, frame_SRND(%rsp) 6565663535bSTim Chen jne loop2 6575663535bSTim Chen 658ca04c823SJosh Poimboeuf mov frame_CTX(%rsp), CTX2 659ca04c823SJosh Poimboeuf addm 8*0(CTX2), a 660ca04c823SJosh Poimboeuf addm 8*1(CTX2), b 661ca04c823SJosh Poimboeuf addm 
8*2(CTX2), c 662ca04c823SJosh Poimboeuf addm 8*3(CTX2), d 663ca04c823SJosh Poimboeuf addm 8*4(CTX2), e 664ca04c823SJosh Poimboeuf addm 8*5(CTX2), f 665ca04c823SJosh Poimboeuf addm 8*6(CTX2), g 666ca04c823SJosh Poimboeuf addm 8*7(CTX2), h 6675663535bSTim Chen 6685663535bSTim Chen mov frame_INP(%rsp), INP 6695663535bSTim Chen add $128, INP 6705663535bSTim Chen cmp frame_INPEND(%rsp), INP 6715663535bSTim Chen jne loop0 6725663535bSTim Chen 6735663535bSTim Chendone_hash: 6745663535bSTim Chen 6755663535bSTim Chen# Restore GPRs 676ca04c823SJosh Poimboeuf mov 8*0+frame_GPRSAVE(%rsp), %rbx 677ca04c823SJosh Poimboeuf mov 8*1+frame_GPRSAVE(%rsp), %r12 678ca04c823SJosh Poimboeuf mov 8*2+frame_GPRSAVE(%rsp), %r13 679ca04c823SJosh Poimboeuf mov 8*3+frame_GPRSAVE(%rsp), %r14 680ca04c823SJosh Poimboeuf mov 8*4+frame_GPRSAVE(%rsp), %r15 6815663535bSTim Chen 6825663535bSTim Chen # Restore Stack Pointer 6835663535bSTim Chen mov frame_RSPSAVE(%rsp), %rsp 6845663535bSTim Chen ret 685*6dcc5627SJiri SlabySYM_FUNC_END(sha512_transform_rorx) 6865663535bSTim Chen 6875663535bSTim Chen######################################################################## 6885663535bSTim Chen### Binary Data 6895663535bSTim Chen 6905663535bSTim Chen 691e183914aSDenys Vlasenko# Mergeable 640-byte rodata section. This allows linker to merge the table 692e183914aSDenys Vlasenko# with other, exactly the same 640-byte fragment of another rodata section 693e183914aSDenys Vlasenko# (if such section exists). 
# ---------------------------------------------------------------------
# K512: the 80 SHA-512 round constants K[0]..K[79], in round order.
#
# The section flags "aM" make the section allocatable and mergeable,
# with an entity size of 640 bytes (80 constants x 8 bytes), so the
# linker may fold this table into a byte-identical copy coming from
# another object.  Each row below holds four constants (32 bytes).
# ---------------------------------------------------------------------
.section	.rodata.cst640.K512, "aM", @progbits, 640
.balign 64
K512:
	.quad	0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc	# K[ 0.. 3]
	.quad	0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118	# K[ 4.. 7]
	.quad	0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2	# K[ 8..11]
	.quad	0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694	# K[12..15]
	.quad	0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65	# K[16..19]
	.quad	0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5	# K[20..23]
	.quad	0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4	# K[24..27]
	.quad	0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70	# K[28..31]
	.quad	0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df	# K[32..35]
	.quad	0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b	# K[36..39]
	.quad	0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30	# K[40..43]
	.quad	0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8	# K[44..47]
	.quad	0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8	# K[48..51]
	.quad	0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3	# K[52..55]
	.quad	0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec	# K[56..59]
	.quad	0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b	# K[60..63]
	.quad	0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178	# K[64..67]
	.quad	0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b	# K[68..71]
	.quad	0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c	# K[72..75]
	.quad	0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817	# K[76..79]

# ---------------------------------------------------------------------
# PSHUFFLE_BYTE_FLIP_MASK: shuffle-control bytes for (v)pshufb that
# reverse the eight bytes of every 64-bit element (endianness swap).
# The table is 32 bytes, i.e. one control byte per byte of a YMM
# register, covering four qwords.
# NOTE(review): presumably the value held in BYTE_FLIP_MASK (%ymm9);
# confirm at the load site.
# ---------------------------------------------------------------------
.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.balign 32
PSHUFFLE_BYTE_FLIP_MASK:
	.quad	0x0001020304050607, 0x08090a0b0c0d0e0f	# control bytes  0..15
	.quad	0x1011121314151617, 0x18191a1b1c1d1e1f	# control bytes 16..31

# ---------------------------------------------------------------------
# MASK_YMM_LO: 32-byte constant whose low 16 bytes are all zero and
# whose high 16 bytes are all ones.
# NOTE(review): name and layout suggest an AND mask that clears the low
# 128-bit half of a YMM value; confirm against its users.
# ---------------------------------------------------------------------
.section	.rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
.balign 32
MASK_YMM_LO:
	.quad	0x0000000000000000, 0x0000000000000000	# bytes  0..15
	.quad	0xffffffffffffffff, 0xffffffffffffffff	# bytes 16..31

#endif /* CONFIG_AS_AVX2 */