########################################################################
# Copyright (c) 2013, Intel Corporation
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
##
## Authors:
##      Erdinc Ozturk <erdinc.ozturk@intel.com>
##      Vinodh Gopal <vinodh.gopal@intel.com>
##      James Guilford <james.guilford@intel.com>
##      Tim Chen <tim.c.chen@linux.intel.com>
##
## References:
##      This code was derived and highly optimized from the code described in the paper:
##              Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
##              on Intel Architecture Processors. August, 2010
##      The details of the implementation are explained in:
##              Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
##              on Intel Architecture Processors. October, 2012.
##
## Assumptions:
##
##
##
## iv:
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                        Salt  (From the SA)                    |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                     Initialization Vector                     |
##       |        (This is the sequence number from IPSec header)        |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x1                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##
##
## AAD:
##       AAD padded to 128 bits with 0
##       for example, assume AAD is a u32 vector
##
##       if AAD is 8 bytes:
##       AAD[3] = {A0, A1}#
##       padded AAD in xmm register = {A1 A0 0 0}
##
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                            SPI (A1)                           |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                  32-bit Sequence Number (A0)                  |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x0                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##       AAD Format with 32-bit Sequence Number
##
##       if AAD is 12 bytes:
##       AAD[3] = {A0, A1, A2}#
##       padded AAD in xmm register = {A2 A1 A0 0}
##
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                            SPI (A2)                           |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |              64-bit Extended Sequence Number {A1,A0}          |
##       |                                                               |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x0                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##       AAD Format with 64-bit Extended Sequence Number
##
##
## aadLen:
##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
##       The code additionally supports aadLen of length 16 bytes.
##
## TLen:
##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
##
## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one-tab and two-tab indentations are used. one tab is
## for the GHASH part, two tabs are for the AES part.
##

#include <linux/linkage.h>
#include <asm/inst.h>

.data
.align 16

POLY:   .octa   0xC2000000000000000000000000000001
POLY2:  .octa   0xC20000000000000000000001C2000000
TWOONE: .octa   0x00000001000000000000000000000001
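# Note on the constants above: POLY is the GHASH reduction polynomial
# x^128 + x^127 + x^126 + x^121 + 1 in bit-reflected form, TWOONE is used by
# the precompute routine when folding the carry of HashKey<<1 back in, and
# POLY2 packs the constants needed by the pclmul-based reduction of
# GHASH_MUL_AVX2 further below.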
# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F

SHUF_MASK:  .octa   0x000102030405060708090A0B0C0D0E0F
SHIFT_MASK: .octa   0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa   0xffffffffffffffffffffffffffffffff
ZERO:       .octa   0x00000000000000000000000000000000
ONE:        .octa   0x00000000000000000000000000000001
ONEf:       .octa   0x01000000000000000000000000000000

.text


##define the fields of the gcm aes context
#{
#        u8 expanded_keys[16*11]  store expanded keys
#        u8 shifted_hkey_1[16]    store HashKey   <<1 mod poly here
#        u8 shifted_hkey_2[16]    store HashKey^2 <<1 mod poly here
#        u8 shifted_hkey_3[16]    store HashKey^3 <<1 mod poly here
#        u8 shifted_hkey_4[16]    store HashKey^4 <<1 mod poly here
#        u8 shifted_hkey_5[16]    store HashKey^5 <<1 mod poly here
#        u8 shifted_hkey_6[16]    store HashKey^6 <<1 mod poly here
#        u8 shifted_hkey_7[16]    store HashKey^7 <<1 mod poly here
#        u8 shifted_hkey_8[16]    store HashKey^8 <<1 mod poly here
#        u8 shifted_hkey_1_k[16]  store XOR HashKey   <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_2_k[16]  store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_3_k[16]  store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_4_k[16]  store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_5_k[16]  store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_6_k[16]  store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_7_k[16]  store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_8_k[16]  store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
#} gcm_ctx#

HashKey     = 16*11  # store HashKey   <<1 mod poly here
HashKey_2   = 16*12  # store HashKey^2 <<1 mod poly here
HashKey_3   = 16*13  # store HashKey^3 <<1 mod poly here
HashKey_4   = 16*14  # store HashKey^4 <<1 mod poly here
HashKey_5   = 16*15  # store HashKey^5 <<1 mod poly here
HashKey_6   = 16*16  # store HashKey^6 <<1 mod poly here
HashKey_7   = 16*17  # store HashKey^7 <<1 mod poly here
HashKey_8   = 16*18  # store HashKey^8 <<1 mod poly here
HashKey_k   = 16*19  # store XOR of HashKey   <<1 mod poly here (for Karatsuba purposes)
HashKey_2_k = 16*20  # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
HashKey_3_k = 16*21  # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
HashKey_4_k = 16*22  # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
HashKey_5_k = 16*23  # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
HashKey_6_k = 16*24  # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
HashKey_7_k = 16*25  # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
HashKey_8_k = 16*26  # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)

#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
#define arg7 STACK_OFFSET+8*1(%r14)
#define arg8 STACK_OFFSET+8*2(%r14)
#define arg9 STACK_OFFSET+8*3(%r14)

i = 0
j = 0

out_order = 0
in_order = 1
DEC = 0
ENC = 1

.macro define_reg r n
reg_\r = %xmm\n
.endm

.macro setreg
.altmacro
define_reg i %i
define_reg j %j
.noaltmacro
.endm
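# Note: define_reg/setreg rely on .altmacro so that the current values of the
# assembler symbols i and j are substituted into register names, e.g. after
# "i = 3" followed by "setreg", reg_i names %xmm3.  The .rep loops below use
# this to step a single macro body across a run of xmm registers.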
# need to push 4 registers into stack to maintain
STACK_OFFSET = 8*4

TMP1 = 16*0  # Temporary storage for AAD
TMP2 = 16*1  # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
TMP3 = 16*2  # Temporary storage for AES State 3
TMP4 = 16*3  # Temporary storage for AES State 4
TMP5 = 16*4  # Temporary storage for AES State 5
TMP6 = 16*5  # Temporary storage for AES State 6
TMP7 = 16*6  # Temporary storage for AES State 7
TMP8 = 16*7  # Temporary storage for AES State 8

VARIABLE_OFFSET = 16*8

################################
# Utility Macros
################################

# Encryption of a single block
.macro ENCRYPT_SINGLE_BLOCK XMM0
        vpxor    (arg1), \XMM0, \XMM0
        i = 1
        setreg
.rep 9
        vaesenc  16*i(arg1), \XMM0, \XMM0
        i = (i+1)
        setreg
.endr
        vaesenclast 16*10(arg1), \XMM0, \XMM0
.endm

#ifdef CONFIG_AS_AVX
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1 )
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
###############################################################################
.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5

        vpshufd     $0b01001110, \GH, \T2
        vpshufd     $0b01001110, \HK, \T3
        vpxor       \GH, \T2, \T2            # T2 = (a1+a0)
        vpxor       \HK, \T3, \T3            # T3 = (b1+b0)

        vpclmulqdq  $0x11, \HK, \GH, \T1     # T1 = a1*b1
        vpclmulqdq  $0x00, \HK, \GH, \GH     # GH = a0*b0
        vpclmulqdq  $0x00, \T3, \T2, \T2     # T2 = (a1+a0)*(b1+b0)
        vpxor       \GH, \T2, \T2
        vpxor       \T1, \T2, \T2            # T2 = a0*b1+a1*b0

        vpslldq     $8, \T2, \T3             # shift-L T3 2 DWs
        vpsrldq     $8, \T2, \T2             # shift-R T2 2 DWs
        vpxor       \T3, \GH, \GH
        vpxor       \T2, \T1, \T1            # <T1:GH> = GH x HK

        #first phase of the reduction
        vpslld      $31, \GH, \T2            # packed right shifting << 31
        vpslld      $30, \GH, \T3            # packed right shifting shift << 30
        vpslld      $25, \GH, \T4            # packed right shifting shift << 25

        vpxor       \T3, \T2, \T2            # xor the shifted versions
        vpxor       \T4, \T2, \T2

        vpsrldq     $4, \T2, \T5             # shift-R T5 1 DW

        vpslldq     $12, \T2, \T2            # shift-L T2 3 DWs
        vpxor       \T2, \GH, \GH            # first phase of the reduction complete

        #second phase of the reduction

        vpsrld      $1, \GH, \T2             # packed left shifting >> 1
        vpsrld      $2, \GH, \T3             # packed left shifting >> 2
        vpsrld      $7, \GH, \T4             # packed left shifting >> 7
        vpxor       \T3, \T2, \T2            # xor the shifted versions
        vpxor       \T4, \T2, \T2

        vpxor       \T5, \T2, \T2
        vpxor       \T2, \GH, \GH
        vpxor       \T1, \GH, \GH            # the result is in GH


.endm
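# The multiply above is Karatsuba over GF(2)[x]: with A = a1:a0 and B = b1:b0
# split into 64-bit halves, A*B is assembled from the three products a1*b1,
# a0*b0 and (a1+a0)*(b1+b0), where the middle term a0*b1+a1*b0 is recovered as
# (a1+a0)*(b1+b0) xor a1*b1 xor a0*b0 (addition is xor in GF(2)).  The 256-bit
# product <T1:GH> is then folded back to 128 bits in two phases against the
# bit-reflected polynomial x^128 + x^127 + x^126 + x^121 + 1 using a
# shift-and-xor sequence (compare GHASH_MUL_AVX2 below, which reduces with a
# carry-less multiply against POLY2 instead).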
.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6

        # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
        vmovdqa  \HK, \T5

        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^2<<1 mod poly
        vmovdqa  \T5, HashKey_2(arg1)                    # [HashKey_2] = HashKey^2<<1 mod poly
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_2_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^3<<1 mod poly
        vmovdqa  \T5, HashKey_3(arg1)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_3_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^4<<1 mod poly
        vmovdqa  \T5, HashKey_4(arg1)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_4_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^5<<1 mod poly
        vmovdqa  \T5, HashKey_5(arg1)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_5_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^6<<1 mod poly
        vmovdqa  \T5, HashKey_6(arg1)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_6_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^7<<1 mod poly
        vmovdqa  \T5, HashKey_7(arg1)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_7_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^8<<1 mod poly
        vmovdqa  \T5, HashKey_8(arg1)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_8_k(arg1)

.endm

## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, r14 are used as a pointer only, not modified

.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
        i = (8-\num_initial_blocks)
        setreg

        mov      arg6, %r10                 # r10 = AAD
        mov      arg7, %r12                 # r12 = aadLen


        mov      %r12, %r11

        vpxor    reg_i, reg_i, reg_i
_get_AAD_loop\@:
        vmovd    (%r10), \T1
        vpslldq  $12, \T1, \T1
        vpsrldq  $4, reg_i, reg_i
        vpxor    \T1, reg_i, reg_i

        add      $4, %r10
        sub      $4, %r12
        jg       _get_AAD_loop\@


        cmp      $16, %r11
        je       _get_AAD_loop2_done\@
        mov      $16, %r12

_get_AAD_loop2\@:
        vpsrldq  $4, reg_i, reg_i
        sub      $4, %r12
        cmp      %r11, %r12
        jg       _get_AAD_loop2\@

_get_AAD_loop2_done\@:

        #byte-reflect the AAD data
        vpshufb  SHUF_MASK(%rip), reg_i, reg_i

        # initialize the data pointer offset as zero
        xor      %r11, %r11

        # start AES for num_initial_blocks blocks
        mov      arg5, %rax                 # rax = *Y0
        vmovdqu  (%rax), \CTR               # CTR = Y0
        vpshufb  SHUF_MASK(%rip), \CTR, \CTR


        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, reg_i
        vpshufb  SHUF_MASK(%rip), reg_i, reg_i  # perform a 16Byte swap
        i = (i+1)
        setreg
.endr

        vmovdqa  (arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpxor    \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = 1
        setreg
.rep 9
        vmovdqa  16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenc  \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = (j+1)
        setreg
.endr


        vmovdqa  16*10(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenclast  \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vmovdqu  (arg3, %r11), \T1
        vpxor    \T1, reg_i, reg_i
        vmovdqu  reg_i, (arg2 , %r11)           # write back ciphertext for num_initial_blocks blocks
        add      $16, %r11
.if \ENC_DEC == DEC
        vmovdqa  \T1, reg_i
.endif
        vpshufb  SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
        i = (i+1)
        setreg
.endr
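# Note: on decryption the ciphertext block that was just read is copied back
# into reg_i before the byte swap, so GHASH is always computed over the
# ciphertext; the encrypted counter output is only used to recover plaintext.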
        i = (8-\num_initial_blocks)
        j = (9-\num_initial_blocks)
        setreg
        GHASH_MUL_AVX  reg_i, \T2, \T1, \T3, \T4, \T5, \T6

.rep \num_initial_blocks
        vpxor    reg_i, reg_j, reg_j
        GHASH_MUL_AVX  reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
        i = (i+1)
        j = (j+1)
        setreg
.endr
        # XMM8 has the combined result here

        vmovdqa  \XMM8, TMP1(%rsp)
        vmovdqa  \XMM8, \T3

        cmp      $128, %r13
        jl       _initial_blocks_done\@      # no need for precomputed constants

###############################################################################
# HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
        vpaddd   ONE(%rip), \CTR, \CTR       # INCR Y0
        vmovdqa  \CTR, \XMM1
        vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR       # INCR Y0
        vmovdqa  \CTR, \XMM2
        vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR       # INCR Y0
        vmovdqa  \CTR, \XMM3
        vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR       # INCR Y0
        vmovdqa  \CTR, \XMM4
        vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR       # INCR Y0
        vmovdqa  \CTR, \XMM5
        vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR       # INCR Y0
        vmovdqa  \CTR, \XMM6
        vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR       # INCR Y0
        vmovdqa  \CTR, \XMM7
        vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR       # INCR Y0
        vmovdqa  \CTR, \XMM8
        vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap

        vmovdqa  (arg1), \T_key
        vpxor    \T_key, \XMM1, \XMM1
        vpxor    \T_key, \XMM2, \XMM2
        vpxor    \T_key, \XMM3, \XMM3
        vpxor    \T_key, \XMM4, \XMM4
        vpxor    \T_key, \XMM5, \XMM5
        vpxor    \T_key, \XMM6, \XMM6
        vpxor    \T_key, \XMM7, \XMM7
        vpxor    \T_key, \XMM8, \XMM8

        i = 1
        setreg
.rep 9  # do 9 rounds
        vmovdqa  16*i(arg1), \T_key
        vaesenc  \T_key, \XMM1, \XMM1
        vaesenc  \T_key, \XMM2, \XMM2
        vaesenc  \T_key, \XMM3, \XMM3
        vaesenc  \T_key, \XMM4, \XMM4
        vaesenc  \T_key, \XMM5, \XMM5
        vaesenc  \T_key, \XMM6, \XMM6
        vaesenc  \T_key, \XMM7, \XMM7
        vaesenc  \T_key, \XMM8, \XMM8
        i = (i+1)
        setreg
.endr


        vmovdqa      16*i(arg1), \T_key
        vaesenclast  \T_key, \XMM1, \XMM1
        vaesenclast  \T_key, \XMM2, \XMM2
        vaesenclast  \T_key, \XMM3, \XMM3
        vaesenclast  \T_key, \XMM4, \XMM4
        vaesenclast  \T_key, \XMM5, \XMM5
        vaesenclast  \T_key, \XMM6, \XMM6
        vaesenclast  \T_key, \XMM7, \XMM7
        vaesenclast  \T_key, \XMM8, \XMM8

        vmovdqu  (arg3, %r11), \T1
        vpxor    \T1, \XMM1, \XMM1
        vmovdqu  \XMM1, (arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM1
        .endif

        vmovdqu  16*1(arg3, %r11), \T1
        vpxor    \T1, \XMM2, \XMM2
        vmovdqu  \XMM2, 16*1(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM2
        .endif

        vmovdqu  16*2(arg3, %r11), \T1
        vpxor    \T1, \XMM3, \XMM3
        vmovdqu  \XMM3, 16*2(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM3
        .endif

        vmovdqu  16*3(arg3, %r11), \T1
        vpxor    \T1, \XMM4, \XMM4
        vmovdqu  \XMM4, 16*3(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM4
        .endif

        vmovdqu  16*4(arg3, %r11), \T1
        vpxor    \T1, \XMM5, \XMM5
        vmovdqu  \XMM5, 16*4(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM5
        .endif
        vmovdqu  16*5(arg3, %r11), \T1
        vpxor    \T1, \XMM6, \XMM6
        vmovdqu  \XMM6, 16*5(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM6
        .endif

        vmovdqu  16*6(arg3, %r11), \T1
        vpxor    \T1, \XMM7, \XMM7
        vmovdqu  \XMM7, 16*6(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM7
        .endif

        vmovdqu  16*7(arg3, %r11), \T1
        vpxor    \T1, \XMM8, \XMM8
        vmovdqu  \XMM8, 16*7(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM8
        .endif

        add      $128, %r11

        vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
        vpxor    TMP1(%rsp), \XMM1, \XMM1       # combine GHASHed value with the corresponding ciphertext
        vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap

###############################################################################

_initial_blocks_done\@:

.endm

# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
# arg1, arg2, arg3 are used as pointers only, not modified
# r11 is the data offset value
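# This macro interleaves the two halves of the work: the AES rounds for the
# next 8 counter blocks are fed through the pipeline while the 8 previously
# produced ciphertext blocks (saved in T2 and TMP2..TMP8) are multiplied by
# HashKey^8..HashKey^1.  Each multiply is Karatsuba-style; the high/low
# products accumulate in T4/T7 and the middle terms in T6, and a single
# two-phase reduction is performed at the end instead of one per block.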
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

        vmovdqa  \XMM1, \T2
        vmovdqa  \XMM2, TMP2(%rsp)
        vmovdqa  \XMM3, TMP3(%rsp)
        vmovdqa  \XMM4, TMP4(%rsp)
        vmovdqa  \XMM5, TMP5(%rsp)
        vmovdqa  \XMM6, TMP6(%rsp)
        vmovdqa  \XMM7, TMP7(%rsp)
        vmovdqa  \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
        vpaddd   ONE(%rip), \CTR, \XMM1          # INCR CNT
        vpaddd   ONE(%rip), \XMM1, \XMM2
        vpaddd   ONE(%rip), \XMM2, \XMM3
        vpaddd   ONE(%rip), \XMM3, \XMM4
        vpaddd   ONE(%rip), \XMM4, \XMM5
        vpaddd   ONE(%rip), \XMM5, \XMM6
        vpaddd   ONE(%rip), \XMM6, \XMM7
        vpaddd   ONE(%rip), \XMM7, \XMM8
        vmovdqa  \XMM8, \CTR

        vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
.else
        vpaddd   ONEf(%rip), \CTR, \XMM1         # INCR CNT
        vpaddd   ONEf(%rip), \XMM1, \XMM2
        vpaddd   ONEf(%rip), \XMM2, \XMM3
        vpaddd   ONEf(%rip), \XMM3, \XMM4
        vpaddd   ONEf(%rip), \XMM4, \XMM5
        vpaddd   ONEf(%rip), \XMM5, \XMM6
        vpaddd   ONEf(%rip), \XMM6, \XMM7
        vpaddd   ONEf(%rip), \XMM7, \XMM8
        vmovdqa  \XMM8, \CTR
.endif


        #######################################################################

        vmovdqu  (arg1), \T1
        vpxor    \T1, \XMM1, \XMM1
        vpxor    \T1, \XMM2, \XMM2
        vpxor    \T1, \XMM3, \XMM3
        vpxor    \T1, \XMM4, \XMM4
        vpxor    \T1, \XMM5, \XMM5
        vpxor    \T1, \XMM6, \XMM6
        vpxor    \T1, \XMM7, \XMM7
        vpxor    \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqu  16*1(arg1), \T1
        vaesenc  \T1, \XMM1, \XMM1
        vaesenc  \T1, \XMM2, \XMM2
        vaesenc  \T1, \XMM3, \XMM3
        vaesenc  \T1, \XMM4, \XMM4
        vaesenc  \T1, \XMM5, \XMM5
        vaesenc  \T1, \XMM6, \XMM6
        vaesenc  \T1, \XMM7, \XMM7
        vaesenc  \T1, \XMM8, \XMM8

        vmovdqu  16*2(arg1), \T1
        vaesenc  \T1, \XMM1, \XMM1
        vaesenc  \T1, \XMM2, \XMM2
        vaesenc  \T1, \XMM3, \XMM3
        vaesenc  \T1, \XMM4, \XMM4
        vaesenc  \T1, \XMM5, \XMM5
        vaesenc  \T1, \XMM6, \XMM6
        vaesenc  \T1, \XMM7, \XMM7
        vaesenc  \T1, \XMM8, \XMM8


        #######################################################################

        vmovdqa     HashKey_8(arg1), \T5
        vpclmulqdq  $0x11, \T5, \T2, \T4        # T4 = a1*b1
        vpclmulqdq  $0x00, \T5, \T2, \T7        # T7 = a0*b0

        vpshufd     $0b01001110, \T2, \T6
        vpxor       \T2, \T6, \T6

        vmovdqa     HashKey_8_k(arg1), \T5
        vpclmulqdq  $0x00, \T5, \T6, \T6

        vmovdqu  16*3(arg1), \T1
        vaesenc  \T1, \XMM1, \XMM1
        vaesenc  \T1, \XMM2, \XMM2
        vaesenc  \T1, \XMM3, \XMM3
        vaesenc  \T1, \XMM4, \XMM4
        vaesenc  \T1, \XMM5, \XMM5
        vaesenc  \T1, \XMM6, \XMM6
        vaesenc  \T1, \XMM7, \XMM7
        vaesenc  \T1, \XMM8, \XMM8

        vmovdqa     TMP2(%rsp), \T1
        vmovdqa     HashKey_7(arg1), \T5
        vpclmulqdq  $0x11, \T5, \T1, \T3
        vpxor       \T3, \T4, \T4
        vpclmulqdq  $0x00, \T5, \T1, \T3
        vpxor       \T3, \T7, \T7

        vpshufd     $0b01001110, \T1, \T3
        vpxor       \T1, \T3, \T3
        vmovdqa     HashKey_7_k(arg1), \T5
        vpclmulqdq  $0x10, \T5, \T3, \T3
        vpxor       \T3, \T6, \T6

        vmovdqu  16*4(arg1), \T1
        vaesenc  \T1, \XMM1, \XMM1
        vaesenc  \T1, \XMM2, \XMM2
        vaesenc  \T1, \XMM3, \XMM3
        vaesenc  \T1, \XMM4, \XMM4
        vaesenc  \T1, \XMM5, \XMM5
        vaesenc  \T1, \XMM6, \XMM6
        vaesenc  \T1, \XMM7, \XMM7
        vaesenc  \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa     TMP3(%rsp), \T1
        vmovdqa     HashKey_6(arg1), \T5
        vpclmulqdq  $0x11, \T5, \T1, \T3
        vpxor       \T3, \T4, \T4
        vpclmulqdq  $0x00, \T5, \T1, \T3
        vpxor       \T3, \T7, \T7

        vpshufd     $0b01001110, \T1, \T3
        vpxor       \T1, \T3, \T3
        vmovdqa     HashKey_6_k(arg1), \T5
        vpclmulqdq  $0x10, \T5, \T3, \T3
        vpxor       \T3, \T6, \T6

        vmovdqu  16*5(arg1), \T1
        vaesenc  \T1, \XMM1, \XMM1
        vaesenc  \T1, \XMM2, \XMM2
        vaesenc  \T1, \XMM3, \XMM3
        vaesenc  \T1, \XMM4, \XMM4
        vaesenc  \T1, \XMM5, \XMM5
        vaesenc  \T1, \XMM6, \XMM6
        vaesenc  \T1, \XMM7, \XMM7
        vaesenc  \T1, \XMM8, \XMM8

        vmovdqa     TMP4(%rsp), \T1
        vmovdqa     HashKey_5(arg1), \T5
        vpclmulqdq  $0x11, \T5, \T1, \T3
        vpxor       \T3, \T4, \T4
        vpclmulqdq  $0x00, \T5, \T1, \T3
        vpxor       \T3, \T7, \T7

        vpshufd     $0b01001110, \T1, \T3
        vpxor       \T1, \T3, \T3
        vmovdqa     HashKey_5_k(arg1), \T5
        vpclmulqdq  $0x10, \T5, \T3, \T3
        vpxor       \T3, \T6, \T6

        vmovdqu  16*6(arg1), \T1
        vaesenc  \T1, \XMM1, \XMM1
        vaesenc  \T1, \XMM2, \XMM2
        vaesenc  \T1, \XMM3, \XMM3
        vaesenc  \T1, \XMM4, \XMM4
        vaesenc  \T1, \XMM5, \XMM5
        vaesenc  \T1, \XMM6, \XMM6
        vaesenc  \T1, \XMM7, \XMM7
        vaesenc  \T1, \XMM8, \XMM8


        vmovdqa     TMP5(%rsp), \T1
        vmovdqa     HashKey_4(arg1), \T5
        vpclmulqdq  $0x11, \T5, \T1, \T3
        vpxor       \T3, \T4, \T4
        vpclmulqdq  $0x00, \T5, \T1, \T3
        vpxor       \T3, \T7, \T7

        vpshufd     $0b01001110, \T1, \T3
        vpxor       \T1, \T3, \T3
        vmovdqa     HashKey_4_k(arg1), \T5
        vpclmulqdq  $0x10, \T5, \T3, \T3
        vpxor       \T3, \T6, \T6

        vmovdqu  16*7(arg1), \T1
        vaesenc  \T1, \XMM1, \XMM1
        vaesenc  \T1, \XMM2, \XMM2
        vaesenc  \T1, \XMM3, \XMM3
        vaesenc  \T1, \XMM4, \XMM4
        vaesenc  \T1, \XMM5, \XMM5
        vaesenc  \T1, \XMM6, \XMM6
        vaesenc  \T1, \XMM7, \XMM7
        vaesenc  \T1, \XMM8, \XMM8

        vmovdqa     TMP6(%rsp), \T1
        vmovdqa     HashKey_3(arg1), \T5
        vpclmulqdq  $0x11, \T5, \T1, \T3
        vpxor       \T3, \T4, \T4
        vpclmulqdq  $0x00, \T5, \T1, \T3
        vpxor       \T3, \T7, \T7

        vpshufd     $0b01001110, \T1, \T3
        vpxor       \T1, \T3, \T3
        vmovdqa     HashKey_3_k(arg1), \T5
        vpclmulqdq  $0x10, \T5, \T3, \T3
        vpxor       \T3, \T6, \T6


        vmovdqu  16*8(arg1), \T1
        vaesenc  \T1, \XMM1, \XMM1
        vaesenc  \T1, \XMM2, \XMM2
        vaesenc  \T1, \XMM3, \XMM3
        vaesenc  \T1, \XMM4, \XMM4
        vaesenc  \T1, \XMM5, \XMM5
        vaesenc  \T1, \XMM6, \XMM6
        vaesenc  \T1, \XMM7, \XMM7
        vaesenc  \T1, \XMM8, \XMM8

        vmovdqa     TMP7(%rsp), \T1
        vmovdqa     HashKey_2(arg1), \T5
        vpclmulqdq  $0x11, \T5, \T1, \T3
        vpxor       \T3, \T4, \T4
        vpclmulqdq  $0x00, \T5, \T1, \T3
        vpxor       \T3, \T7, \T7

        vpshufd     $0b01001110, \T1, \T3
        vpxor       \T1, \T3, \T3
        vmovdqa     HashKey_2_k(arg1), \T5
        vpclmulqdq  $0x10, \T5, \T3, \T3
        vpxor       \T3, \T6, \T6

        #######################################################################

        vmovdqu  16*9(arg1), \T5
        vaesenc  \T5, \XMM1, \XMM1
        vaesenc  \T5, \XMM2, \XMM2
        vaesenc  \T5, \XMM3, \XMM3
        vaesenc  \T5, \XMM4, \XMM4
        vaesenc  \T5, \XMM5, \XMM5
        vaesenc  \T5, \XMM6, \XMM6
        vaesenc  \T5, \XMM7, \XMM7
        vaesenc  \T5, \XMM8, \XMM8

        vmovdqa     TMP8(%rsp), \T1
        vmovdqa     HashKey(arg1), \T5
        vpclmulqdq  $0x11, \T5, \T1, \T3
        vpxor       \T3, \T4, \T4
        vpclmulqdq  $0x00, \T5, \T1, \T3
        vpxor       \T3, \T7, \T7

        vpshufd     $0b01001110, \T1, \T3
        vpxor       \T1, \T3, \T3
        vmovdqa     HashKey_k(arg1), \T5
        vpclmulqdq  $0x10, \T5, \T3, \T3
        vpxor       \T3, \T6, \T6

        vpxor       \T4, \T6, \T6
        vpxor       \T7, \T6, \T6

        vmovdqu  16*10(arg1), \T5

        i = 0
        j = 1
        setreg
.rep 8
        vpxor    16*i(arg3, %r11), \T5, \T2
        .if \ENC_DEC == ENC
        vaesenclast  \T2, reg_j, reg_j
        .else
        vaesenclast  \T2, reg_j, \T3
        vmovdqu  16*i(arg3, %r11), reg_j
        vmovdqu  \T3, 16*i(arg2, %r11)
        .endif
        i = (i+1)
        j = (j+1)
        setreg
.endr
        #######################################################################


        vpslldq  $8, \T6, \T3                   # shift-L T3 2 DWs
        vpsrldq  $8, \T6, \T6                   # shift-R T6 2 DWs
        vpxor    \T3, \T7, \T7
        vpxor    \T4, \T6, \T6                  # accumulate the results in T6:T7



        #######################################################################
        #first phase of the reduction
        #######################################################################
        vpslld   $31, \T7, \T2                  # packed right shifting << 31
        vpslld   $30, \T7, \T3                  # packed right shifting shift << 30
        vpslld   $25, \T7, \T4                  # packed right shifting shift << 25

        vpxor    \T3, \T2, \T2                  # xor the shifted versions
        vpxor    \T4, \T2, \T2

        vpsrldq  $4, \T2, \T1                   # shift-R T1 1 DW

        vpslldq  $12, \T2, \T2                  # shift-L T2 3 DWs
        vpxor    \T2, \T7, \T7                  # first phase of the reduction complete
        #######################################################################
        .if \ENC_DEC == ENC
        vmovdqu  \XMM1, 16*0(arg2,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM2, 16*1(arg2,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM3, 16*2(arg2,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM4, 16*3(arg2,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM5, 16*4(arg2,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM6, 16*5(arg2,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM7, 16*6(arg2,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM8, 16*7(arg2,%r11)         # Write to the Ciphertext buffer
        .endif

        #######################################################################
        #second phase of the reduction
        vpsrld   $1, \T7, \T2                   # packed left shifting >> 1
        vpsrld   $2, \T7, \T3                   # packed left shifting >> 2
        vpsrld   $7, \T7, \T4                   # packed left shifting >> 7
        vpxor    \T3, \T2, \T2                  # xor the shifted versions
        vpxor    \T4, \T2, \T2

        vpxor    \T1, \T2, \T2
        vpxor    \T2, \T7, \T7
        vpxor    \T7, \T6, \T6                  # the result is in T6
        #######################################################################

        vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap


        vpxor    \T6, \XMM1, \XMM1



.endm
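# GHASH_LAST_8_AVX folds the final 8 shuffled ciphertext blocks (XMM1..XMM8)
# into the hash, multiplying block i by HashKey^(9-i) so that all eight
# products line up at the same power of H, again with Karatsuba middle terms
# via the HashKey_i_k values and one combined two-phase reduction at the end.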
# GHASH the last 8 ciphertext blocks.
.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method


        vpshufd     $0b01001110, \XMM1, \T2
        vpxor       \XMM1, \T2, \T2
        vmovdqa     HashKey_8(arg1), \T5
        vpclmulqdq  $0x11, \T5, \XMM1, \T6
        vpclmulqdq  $0x00, \T5, \XMM1, \T7

        vmovdqa     HashKey_8_k(arg1), \T3
        vpclmulqdq  $0x00, \T3, \T2, \XMM1

        ######################

        vpshufd     $0b01001110, \XMM2, \T2
        vpxor       \XMM2, \T2, \T2
        vmovdqa     HashKey_7(arg1), \T5
        vpclmulqdq  $0x11, \T5, \XMM2, \T4
        vpxor       \T4, \T6, \T6

        vpclmulqdq  $0x00, \T5, \XMM2, \T4
        vpxor       \T4, \T7, \T7

        vmovdqa     HashKey_7_k(arg1), \T3
        vpclmulqdq  $0x00, \T3, \T2, \T2
        vpxor       \T2, \XMM1, \XMM1

        ######################

        vpshufd     $0b01001110, \XMM3, \T2
        vpxor       \XMM3, \T2, \T2
        vmovdqa     HashKey_6(arg1), \T5
        vpclmulqdq  $0x11, \T5, \XMM3, \T4
        vpxor       \T4, \T6, \T6

        vpclmulqdq  $0x00, \T5, \XMM3, \T4
        vpxor       \T4, \T7, \T7

        vmovdqa     HashKey_6_k(arg1), \T3
        vpclmulqdq  $0x00, \T3, \T2, \T2
        vpxor       \T2, \XMM1, \XMM1

        ######################

        vpshufd     $0b01001110, \XMM4, \T2
        vpxor       \XMM4, \T2, \T2
        vmovdqa     HashKey_5(arg1), \T5
        vpclmulqdq  $0x11, \T5, \XMM4, \T4
        vpxor       \T4, \T6, \T6

        vpclmulqdq  $0x00, \T5, \XMM4, \T4
        vpxor       \T4, \T7, \T7

        vmovdqa     HashKey_5_k(arg1), \T3
        vpclmulqdq  $0x00, \T3, \T2, \T2
        vpxor       \T2, \XMM1, \XMM1

        ######################

        vpshufd     $0b01001110, \XMM5, \T2
        vpxor       \XMM5, \T2, \T2
        vmovdqa     HashKey_4(arg1), \T5
        vpclmulqdq  $0x11, \T5, \XMM5, \T4
        vpxor       \T4, \T6, \T6

        vpclmulqdq  $0x00, \T5, \XMM5, \T4
        vpxor       \T4, \T7, \T7

        vmovdqa     HashKey_4_k(arg1), \T3
        vpclmulqdq  $0x00, \T3, \T2, \T2
        vpxor       \T2, \XMM1, \XMM1

        ######################

        vpshufd     $0b01001110, \XMM6, \T2
        vpxor       \XMM6, \T2, \T2
        vmovdqa     HashKey_3(arg1), \T5
        vpclmulqdq  $0x11, \T5, \XMM6, \T4
        vpxor       \T4, \T6, \T6

        vpclmulqdq  $0x00, \T5, \XMM6, \T4
        vpxor       \T4, \T7, \T7

        vmovdqa     HashKey_3_k(arg1), \T3
        vpclmulqdq  $0x00, \T3, \T2, \T2
        vpxor       \T2, \XMM1, \XMM1

        ######################

        vpshufd     $0b01001110, \XMM7, \T2
        vpxor       \XMM7, \T2, \T2
        vmovdqa     HashKey_2(arg1), \T5
        vpclmulqdq  $0x11, \T5, \XMM7, \T4
        vpxor       \T4, \T6, \T6

        vpclmulqdq  $0x00, \T5, \XMM7, \T4
        vpxor       \T4, \T7, \T7

        vmovdqa     HashKey_2_k(arg1), \T3
        vpclmulqdq  $0x00, \T3, \T2, \T2
        vpxor       \T2, \XMM1, \XMM1

        ######################

        vpshufd     $0b01001110, \XMM8, \T2
        vpxor       \XMM8, \T2, \T2
        vmovdqa     HashKey(arg1), \T5
        vpclmulqdq  $0x11, \T5, \XMM8, \T4
        vpxor       \T4, \T6, \T6

        vpclmulqdq  $0x00, \T5, \XMM8, \T4
        vpxor       \T4, \T7, \T7

        vmovdqa     HashKey_k(arg1), \T3
        vpclmulqdq  $0x00, \T3, \T2, \T2

        vpxor       \T2, \XMM1, \XMM1
        vpxor       \T6, \XMM1, \XMM1
        vpxor       \T7, \XMM1, \T2




        vpslldq  $8, \T2, \T4
        vpsrldq  $8, \T2, \T2

        vpxor    \T4, \T7, \T7
        vpxor    \T2, \T6, \T6                  # <T6:T7> holds the result of
                                                # the accumulated carry-less multiplications

        #######################################################################
        #first phase of the reduction
        vpslld   $31, \T7, \T2                  # packed right shifting << 31
        vpslld   $30, \T7, \T3                  # packed right shifting shift << 30
        vpslld   $25, \T7, \T4                  # packed right shifting shift << 25

        vpxor    \T3, \T2, \T2                  # xor the shifted versions
        vpxor    \T4, \T2, \T2

        vpsrldq  $4, \T2, \T1                   # shift-R T1 1 DW

        vpslldq  $12, \T2, \T2                  # shift-L T2 3 DWs
        vpxor    \T2, \T7, \T7                  # first phase of the reduction complete
        #######################################################################


        #second phase of the reduction
        vpsrld   $1, \T7, \T2                   # packed left shifting >> 1
        vpsrld   $2, \T7, \T3                   # packed left shifting >> 2
        vpsrld   $7, \T7, \T4                   # packed left shifting >> 7
        vpxor    \T3, \T2, \T2                  # xor the shifted versions
        vpxor    \T4, \T2, \T2

        vpxor    \T1, \T2, \T2
        vpxor    \T2, \T7, \T7
        vpxor    \T7, \T6, \T6                  # the result is in T6

.endm
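# Overall flow of GCM_ENC_DEC_AVX: hash the AAD and encrypt/decrypt the first
# (number-of-blocks mod 8) blocks with INITIAL_BLOCKS_AVX, run the remaining
# full blocks through GHASH_8_ENCRYPT_8_PARALLEL_AVX eight at a time (the
# cmp $(255-8) test selects the byte-swap-free counter path while the low
# counter byte cannot wrap), finish the hash with GHASH_LAST_8_AVX, handle a
# trailing partial block with a masked single-block encryption, then mix in
# len(A)||len(C), encrypt Y0 and emit the 8/12/16-byte tag.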
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r14, r15
.macro GCM_ENC_DEC_AVX ENC_DEC

        #the number of pushes must equal STACK_OFFSET
        push     %r12
        push     %r13
        push     %r14
        push     %r15

        mov      %rsp, %r14




        sub      $VARIABLE_OFFSET, %rsp
        and      $~63, %rsp                     # align rsp to 64 bytes


        vmovdqu  HashKey(arg1), %xmm13          # xmm13 = HashKey

        mov      arg4, %r13                     # save the number of bytes of plaintext/ciphertext
        and      $-16, %r13                     # r13 = r13 - (r13 mod 16)

        mov      %r13, %r12
        shr      $4, %r12
        and      $7, %r12
        jz       _initial_num_blocks_is_0\@

        cmp      $7, %r12
        je       _initial_num_blocks_is_7\@
        cmp      $6, %r12
        je       _initial_num_blocks_is_6\@
        cmp      $5, %r12
        je       _initial_num_blocks_is_5\@
        cmp      $4, %r12
        je       _initial_num_blocks_is_4\@
        cmp      $3, %r12
        je       _initial_num_blocks_is_3\@
        cmp      $2, %r12
        je       _initial_num_blocks_is_2\@

        jmp      _initial_num_blocks_is_1\@

_initial_num_blocks_is_7\@:
        INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub      $16*7, %r13
        jmp      _initial_blocks_encrypted\@

_initial_num_blocks_is_6\@:
        INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub      $16*6, %r13
        jmp      _initial_blocks_encrypted\@

_initial_num_blocks_is_5\@:
        INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub      $16*5, %r13
        jmp      _initial_blocks_encrypted\@

_initial_num_blocks_is_4\@:
        INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub      $16*4, %r13
        jmp      _initial_blocks_encrypted\@

_initial_num_blocks_is_3\@:
        INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub      $16*3, %r13
        jmp      _initial_blocks_encrypted\@

_initial_num_blocks_is_2\@:
        INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub      $16*2, %r13
        jmp      _initial_blocks_encrypted\@

_initial_num_blocks_is_1\@:
        INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub      $16*1, %r13
        jmp      _initial_blocks_encrypted\@

_initial_num_blocks_is_0\@:
        INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC


_initial_blocks_encrypted\@:
        cmp      $0, %r13
        je       _zero_cipher_left\@

        sub      $128, %r13
        je       _eight_cipher_left\@




        vmovd    %xmm9, %r15d
        and      $255, %r15d
        vpshufb  SHUF_MASK(%rip), %xmm9, %xmm9


_encrypt_by_8_new\@:
        cmp      $(255-8), %r15d
        jg       _encrypt_by_8\@



        add      $8, %r15b
        GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
        add      $128, %r11
        sub      $128, %r13
        jne      _encrypt_by_8_new\@

        vpshufb  SHUF_MASK(%rip), %xmm9, %xmm9
        jmp      _eight_cipher_left\@

_encrypt_by_8\@:
        vpshufb  SHUF_MASK(%rip), %xmm9, %xmm9
        add      $8, %r15b
        GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
        vpshufb  SHUF_MASK(%rip), %xmm9, %xmm9
        add      $128, %r11
        sub      $128, %r13
        jne      _encrypt_by_8_new\@

        vpshufb  SHUF_MASK(%rip), %xmm9, %xmm9




_eight_cipher_left\@:
        GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8


_zero_cipher_left\@:
        cmp      $16, arg4
        jl       _only_less_than_16\@

        mov      arg4, %r13
        and      $15, %r13                      # r13 = (arg4 mod 16)

        je       _multiple_of_16_bytes\@

        # handle the last <16 Byte block separately


        vpaddd   ONE(%rip), %xmm9, %xmm9        # INCR CNT to get Yn
        vpshufb  SHUF_MASK(%rip), %xmm9, %xmm9
        ENCRYPT_SINGLE_BLOCK %xmm9              # E(K, Yn)

        sub      $16, %r11
        add      %r13, %r11
        vmovdqu  (arg3, %r11), %xmm1            # receive the last <16 Byte block

        lea      SHIFT_MASK+16(%rip), %r12
        sub      %r13, %r12                     # adjust the shuffle mask pointer to be
                                                # able to shift 16-r13 bytes (r13 is the
                                                # number of bytes in plaintext mod 16)
        vmovdqu  (%r12), %xmm2                  # get the appropriate shuffle mask
        vpshufb  %xmm2, %xmm1, %xmm1            # shift right 16-r13 bytes
        jmp      _final_ghash_mul\@
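# Partial-block handling: %r12 is left pointing at SHIFT_MASK+16-r13, so the
# 16-byte load from ALL_F-SHIFT_MASK(%r12) in _final_ghash_mul picks up r13
# bytes of 0xff followed by zero bytes spilling over from ZERO -- this is why
# ALL_F must immediately follow SHIFT_MASK and ZERO must follow ALL_F in
# .data.  The mask clears the unused top 16-r13 bytes of E(K,Yn) (and of the
# saved ciphertext on decrypt) before the block enters the GHASH state.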
_only_less_than_16\@:
        # check for 0 length
        mov      arg4, %r13
        and      $15, %r13                      # r13 = (arg4 mod 16)

        je       _multiple_of_16_bytes\@

        # handle the last <16 Byte block separately


        vpaddd   ONE(%rip), %xmm9, %xmm9        # INCR CNT to get Yn
        vpshufb  SHUF_MASK(%rip), %xmm9, %xmm9
        ENCRYPT_SINGLE_BLOCK %xmm9              # E(K, Yn)


        lea      SHIFT_MASK+16(%rip), %r12
        sub      %r13, %r12                     # adjust the shuffle mask pointer to be
                                                # able to shift 16-r13 bytes (r13 is the
                                                # number of bytes in plaintext mod 16)

_get_last_16_byte_loop\@:
        movb     (arg3, %r11), %al
        movb     %al, TMP1 (%rsp , %r11)
        add      $1, %r11
        cmp      %r13, %r11
        jne      _get_last_16_byte_loop\@

        vmovdqu  TMP1(%rsp), %xmm1

        sub      $16, %r11

_final_ghash_mul\@:
        .if \ENC_DEC == DEC
        vmovdqa  %xmm1, %xmm2
        vpxor    %xmm1, %xmm9, %xmm9            # Plaintext XOR E(K, Yn)
        vmovdqu  ALL_F-SHIFT_MASK(%r12), %xmm1  # get the appropriate mask to
                                                # mask out top 16-r13 bytes of xmm9
        vpand    %xmm1, %xmm9, %xmm9            # mask out top 16-r13 bytes of xmm9
        vpand    %xmm1, %xmm2, %xmm2
        vpshufb  SHUF_MASK(%rip), %xmm2, %xmm2
        vpxor    %xmm2, %xmm14, %xmm14
        #GHASH computation for the last <16 Byte block
        GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        sub      %r13, %r11
        add      $16, %r11
        .else
        vpxor    %xmm1, %xmm9, %xmm9            # Plaintext XOR E(K, Yn)
        vmovdqu  ALL_F-SHIFT_MASK(%r12), %xmm1  # get the appropriate mask to
                                                # mask out top 16-r13 bytes of xmm9
        vpand    %xmm1, %xmm9, %xmm9            # mask out top 16-r13 bytes of xmm9
        vpshufb  SHUF_MASK(%rip), %xmm9, %xmm9
        vpxor    %xmm9, %xmm14, %xmm14
        #GHASH computation for the last <16 Byte block
        GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        sub      %r13, %r11
        add      $16, %r11
        vpshufb  SHUF_MASK(%rip), %xmm9, %xmm9  # shuffle xmm9 back to output as ciphertext
        .endif


        #############################
        # output r13 Bytes
        vmovq    %xmm9, %rax
        cmp      $8, %r13
        jle      _less_than_8_bytes_left\@

        mov      %rax, (arg2 , %r11)
        add      $8, %r11
        vpsrldq  $8, %xmm9, %xmm9
        vmovq    %xmm9, %rax
        sub      $8, %r13

_less_than_8_bytes_left\@:
        movb     %al, (arg2 , %r11)
        add      $1, %r11
        shr      $8, %rax
        sub      $1, %r13
        jne      _less_than_8_bytes_left\@
        #############################

_multiple_of_16_bytes\@:
        mov      arg7, %r12                     # r12 = aadLen (number of bytes)
        shl      $3, %r12                       # convert into number of bits
        vmovd    %r12d, %xmm15                  # len(A) in xmm15

        shl      $3, arg4                       # len(C) in bits (*8)
        vmovq    arg4, %xmm1
        vpslldq  $8, %xmm15, %xmm15             # xmm15 = len(A)|| 0x0000000000000000
        vpxor    %xmm1, %xmm15, %xmm15          # xmm15 = len(A)||len(C)

        vpxor    %xmm15, %xmm14, %xmm14
        GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6  # final GHASH computation
        vpshufb  SHUF_MASK(%rip), %xmm14, %xmm14  # perform a 16Byte swap

        mov      arg5, %rax                     # rax = *Y0
        vmovdqu  (%rax), %xmm9                  # xmm9 = Y0

        ENCRYPT_SINGLE_BLOCK %xmm9              # E(K, Y0)

        vpxor    %xmm14, %xmm9, %xmm9



_return_T\@:
        mov      arg8, %r10                     # r10 = authTag
        mov      arg9, %r11                     # r11 = auth_tag_len

        cmp      $16, %r11
        je       _T_16\@

        cmp      $12, %r11
        je       _T_12\@

_T_8\@:
        vmovq    %xmm9, %rax
        mov      %rax, (%r10)
        jmp      _return_T_done\@
_T_12\@:
        vmovq    %xmm9, %rax
        mov      %rax, (%r10)
        vpsrldq  $8, %xmm9, %xmm9
        vmovd    %xmm9, %eax
        mov      %eax, 8(%r10)
        jmp      _return_T_done\@

_T_16\@:
        vmovdqu  %xmm9, (%r10)

_return_T_done\@:
        mov      %r14, %rsp

        pop      %r15
        pop      %r14
        pop      %r13
        pop      %r12
.endm


#############################################################
#void   aesni_gcm_precomp_avx_gen2
#        (gcm_data     *my_ctx_data,
#         u8     *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#############################################################
ENTRY(aesni_gcm_precomp_avx_gen2)
        #the number of pushes must equal STACK_OFFSET
        push     %r12
        push     %r13
        push     %r14
        push     %r15

        mov      %rsp, %r14



        sub      $VARIABLE_OFFSET, %rsp
        and      $~63, %rsp                     # align rsp to 64 bytes

        vmovdqu  (arg2), %xmm6                  # xmm6 = HashKey

        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa  %xmm6, %xmm2
        vpsllq   $1, %xmm6, %xmm6
        vpsrlq   $63, %xmm2, %xmm2
        vmovdqa  %xmm2, %xmm1
        vpslldq  $8, %xmm2, %xmm2
        vpsrldq  $8, %xmm1, %xmm1
        vpor     %xmm2, %xmm6, %xmm6
        #reduction
        vpshufd  $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand    POLY(%rip), %xmm2, %xmm2
        vpxor    %xmm2, %xmm6, %xmm6            # xmm6 holds the HashKey<<1 mod poly
        #######################################################################
        vmovdqa  %xmm6, HashKey(arg1)           # store HashKey<<1 mod poly


        PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

        mov      %r14, %rsp

        pop      %r15
        pop      %r14
        pop      %r13
        pop      %r12
        ret
ENDPROC(aesni_gcm_precomp_avx_gen2)
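# Typical call sequence from C (a minimal sketch only; the exact context type
# and the key-expansion/H-derivation steps are assumptions to be supplied by
# the caller as described in the gcm_ctx comment near the top of this file):
#
#       struct gcm_data ctx;   /* at least 16*27 bytes, 16-byte aligned */
#       /* expand the AES key into ctx, compute the hash subkey H = E_K(0^128) */
#       aesni_gcm_precomp_avx_gen2(&ctx, hash_subkey);
#       aesni_gcm_enc_avx_gen2(&ctx, out, in, plaintext_len, iv,
#                              aad, aad_len, auth_tag, auth_tag_len);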
###############################################################################
#void   aesni_gcm_enc_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                       (from Security Association) concatenated with 8 byte
#                       Initialisation Vector (from IPSec ESP Payload)
#                       concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#                               Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_enc_avx_gen2)
        GCM_ENC_DEC_AVX ENC
        ret
ENDPROC(aesni_gcm_enc_avx_gen2)

###############################################################################
#void   aesni_gcm_dec_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                       (from Security Association) concatenated with 8 byte
#                       Initialisation Vector (from IPSec ESP Payload)
#                       concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#                               Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_dec_avx_gen2)
        GCM_ENC_DEC_AVX DEC
        ret
ENDPROC(aesni_gcm_dec_avx_gen2)
#endif /* CONFIG_AS_AVX */

#ifdef CONFIG_AS_AVX2
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1 )
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
###############################################################################
.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5

        vpclmulqdq  $0x11, \HK, \GH, \T1        # T1 = a1*b1
        vpclmulqdq  $0x00, \HK, \GH, \T2        # T2 = a0*b0
        vpclmulqdq  $0x01, \HK, \GH, \T3        # T3 = a1*b0
        vpclmulqdq  $0x10, \HK, \GH, \GH        # GH = a0*b1
        vpxor       \T3, \GH, \GH


        vpsrldq     $8 , \GH, \T3               # shift-R GH 2 DWs
        vpslldq     $8 , \GH, \GH               # shift-L GH 2 DWs

        vpxor       \T3, \T1, \T1
        vpxor       \T2, \GH, \GH

        #######################################################################
        #first phase of the reduction
        vmovdqa     POLY2(%rip), \T3

        vpclmulqdq  $0x01, \GH, \T3, \T2
        vpslldq     $8, \T2, \T2                # shift-L T2 2 DWs

        vpxor       \T2, \GH, \GH               # first phase of the reduction complete
        #######################################################################
        #second phase of the reduction
        vpclmulqdq  $0x00, \GH, \T3, \T2
        vpsrldq     $4, \T2, \T2                # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq  $0x10, \GH, \T3, \GH
        vpslldq     $4, \GH, \GH                # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor       \T2, \GH, \GH               # second phase of the reduction complete
        #######################################################################
        vpxor       \T1, \GH, \GH               # the result is in GH


.endm

.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6

        # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
        vmovdqa  \HK, \T5
        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^2<<1 mod poly
        vmovdqa  \T5, HashKey_2(arg1)                     # [HashKey_2] = HashKey^2<<1 mod poly

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^3<<1 mod poly
        vmovdqa  \T5, HashKey_3(arg1)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^4<<1 mod poly
        vmovdqa  \T5, HashKey_4(arg1)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^5<<1 mod poly
        vmovdqa  \T5, HashKey_5(arg1)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^6<<1 mod poly
        vmovdqa  \T5, HashKey_6(arg1)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^7<<1 mod poly
        vmovdqa  \T5, HashKey_7(arg1)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2  # T5 = HashKey^8<<1 mod poly
        vmovdqa  \T5, HashKey_8(arg1)

.endm
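# Note: unlike PRECOMPUTE_AVX, this variant does not store the HashKey_i_k
# values, because GHASH_MUL_AVX2 computes the two cross products a1*b0 and
# a0*b1 directly with vpclmulqdq ($0x01/$0x10) instead of a Karatsuba middle
# term, and it reduces with a carry-less multiply against POLY2 rather than
# the shift-and-xor sequence used in GHASH_MUL_AVX.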

## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, r14 are used as a pointer only, not modified

.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
        i = (8-\num_initial_blocks)
        setreg

        mov      arg6, %r10                     # r10 = AAD
        mov      arg7, %r12                     # r12 = aadLen


        mov      %r12, %r11

        vpxor    reg_i, reg_i, reg_i
_get_AAD_loop\@:
        vmovd    (%r10), \T1
        vpslldq  $12, \T1, \T1
        vpsrldq  $4, reg_i, reg_i
        vpxor    \T1, reg_i, reg_i

        add      $4, %r10
        sub      $4, %r12
        jg       _get_AAD_loop\@


        cmp      $16, %r11
        je       _get_AAD_loop2_done\@
        mov      $16, %r12

_get_AAD_loop2\@:
        vpsrldq  $4, reg_i, reg_i
        sub      $4, %r12
        cmp      %r11, %r12
        jg       _get_AAD_loop2\@

_get_AAD_loop2_done\@:

        #byte-reflect the AAD data
        vpshufb  SHUF_MASK(%rip), reg_i, reg_i

        # initialize the data pointer offset as zero
        xor      %r11, %r11

        # start AES for num_initial_blocks blocks
        mov      arg5, %rax                     # rax = *Y0
        vmovdqu  (%rax), \CTR                   # CTR = Y0
        vpshufb  SHUF_MASK(%rip), \CTR, \CTR


        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, reg_i
        vpshufb  SHUF_MASK(%rip), reg_i, reg_i  # perform a 16Byte swap
        i = (i+1)
        setreg
.endr

        vmovdqa  (arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpxor    \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = 1
        setreg
.rep 9
        vmovdqa  16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenc  \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = (j+1)
        setreg
.endr


        vmovdqa  16*10(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenclast  \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vmovdqu  (arg3, %r11), \T1
        vpxor    \T1, reg_i, reg_i
        vmovdqu  reg_i, (arg2 , %r11)           # write back ciphertext for
                                                # num_initial_blocks blocks
        add      $16, %r11
.if \ENC_DEC == DEC
        vmovdqa  \T1, reg_i
.endif
        vpshufb  SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
        i = (i+1)
        setreg
.endr


        i = (8-\num_initial_blocks)
        j = (9-\num_initial_blocks)
        setreg
        GHASH_MUL_AVX2  reg_i, \T2, \T1, \T3, \T4, \T5, \T6

.rep \num_initial_blocks
        vpxor    reg_i, reg_j, reg_j
        GHASH_MUL_AVX2  reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
        i = (i+1)
        j = (j+1)
        setreg
.endr
        # XMM8 has the combined result here

        vmovdqa  \XMM8, TMP1(%rsp)
        vmovdqa  \XMM8, \T3

        cmp      $128, %r13
        jl       _initial_blocks_done\@         # no need for precomputed constants

###############################################################################
# HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM1
        vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM2
        vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM3
        vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM4
        vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM5
        vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM6
        vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM7
        vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM8
        vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap

        vmovdqa  (arg1), \T_key
        vpxor    \T_key, \XMM1, \XMM1
        vpxor    \T_key, \XMM2, \XMM2
        vpxor    \T_key, \XMM3, \XMM3
        vpxor    \T_key, \XMM4, \XMM4
        vpxor    \T_key, \XMM5, \XMM5
        vpxor    \T_key, \XMM6, \XMM6
        vpxor    \T_key, \XMM7, \XMM7
        vpxor    \T_key, \XMM8, \XMM8

        i = 1
        setreg
.rep 9  # do 9 rounds
        vmovdqa  16*i(arg1), \T_key
        vaesenc  \T_key, \XMM1, \XMM1
        vaesenc  \T_key, \XMM2, \XMM2
        vaesenc  \T_key, \XMM3, \XMM3
        vaesenc  \T_key, \XMM4, \XMM4
        vaesenc  \T_key, \XMM5, \XMM5
        vaesenc  \T_key, \XMM6, \XMM6
        vaesenc  \T_key, \XMM7, \XMM7
        vaesenc  \T_key, \XMM8, \XMM8
        i = (i+1)
        setreg
.endr


        vmovdqa      16*i(arg1), \T_key
        vaesenclast  \T_key, \XMM1, \XMM1
        vaesenclast  \T_key, \XMM2, \XMM2
        vaesenclast  \T_key, \XMM3, \XMM3
        vaesenclast  \T_key, \XMM4, \XMM4
        vaesenclast  \T_key, \XMM5, \XMM5
        vaesenclast  \T_key, \XMM6, \XMM6
        vaesenclast  \T_key, \XMM7, \XMM7
        vaesenclast  \T_key, \XMM8, \XMM8

        vmovdqu  (arg3, %r11), \T1
        vpxor    \T1, \XMM1, \XMM1
        vmovdqu  \XMM1, (arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM1
        .endif

        vmovdqu  16*1(arg3, %r11), \T1
        vpxor    \T1, \XMM2, \XMM2
        vmovdqu  \XMM2, 16*1(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM2
        .endif

        vmovdqu  16*2(arg3, %r11), \T1
        vpxor    \T1, \XMM3, \XMM3
        vmovdqu  \XMM3, 16*2(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM3
        .endif

        vmovdqu  16*3(arg3, %r11), \T1
        vpxor    \T1, \XMM4, \XMM4
        vmovdqu  \XMM4, 16*3(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM4
        .endif

        vmovdqu  16*4(arg3, %r11), \T1
        vpxor    \T1, \XMM5, \XMM5
        vmovdqu  \XMM5, 16*4(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM5
        .endif

        vmovdqu  16*5(arg3, %r11), \T1
        vpxor    \T1, \XMM6, \XMM6
        vmovdqu  \XMM6, 16*5(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM6
        .endif

        vmovdqu  16*6(arg3, %r11), \T1
        vpxor    \T1, \XMM7, \XMM7
        vmovdqu  \XMM7, 16*6(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM7
        .endif

        vmovdqu  16*7(arg3, %r11), \T1
        vpxor    \T1, \XMM8, \XMM8
        vmovdqu  \XMM8, 16*7(arg2 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa  \T1, \XMM8
        .endif

        add      $128, %r11

        vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
        vpxor    TMP1(%rsp), \XMM1, \XMM1       # combine GHASHed value with
                                                # the corresponding ciphertext
        vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpxor   TMP1(%rsp), \XMM1, \XMM1        # combine GHASHed value with
                                                # the corresponding ciphertext
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

###############################################################################

_initial_blocks_done\@:

.endm


# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
# arg1, arg2, arg3 are used as pointers only, not modified
# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

        vmovdqa \XMM1, \T2
        vmovdqa \XMM2, TMP2(%rsp)
        vmovdqa \XMM3, TMP3(%rsp)
        vmovdqa \XMM4, TMP4(%rsp)
        vmovdqa \XMM5, TMP5(%rsp)
        vmovdqa \XMM6, TMP6(%rsp)
        vmovdqa \XMM7, TMP7(%rsp)
        vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
        vpaddd  ONE(%rip), \CTR, \XMM1          # INCR CNT
        vpaddd  ONE(%rip), \XMM1, \XMM2
        vpaddd  ONE(%rip), \XMM2, \XMM3
        vpaddd  ONE(%rip), \XMM3, \XMM4
        vpaddd  ONE(%rip), \XMM4, \XMM5
        vpaddd  ONE(%rip), \XMM5, \XMM6
        vpaddd  ONE(%rip), \XMM6, \XMM7
        vpaddd  ONE(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
.else
        vpaddd  ONEf(%rip), \CTR, \XMM1         # INCR CNT
        vpaddd  ONEf(%rip), \XMM1, \XMM2
        vpaddd  ONEf(%rip), \XMM2, \XMM3
        vpaddd  ONEf(%rip), \XMM3, \XMM4
        vpaddd  ONEf(%rip), \XMM4, \XMM5
        vpaddd  ONEf(%rip), \XMM5, \XMM6
        vpaddd  ONEf(%rip), \XMM6, \XMM7
        vpaddd  ONEf(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR
.endif


        #######################################################################

        vmovdqu (arg1), \T1
        vpxor   \T1, \XMM1, \XMM1
        vpxor   \T1, \XMM2, \XMM2
        vpxor   \T1, \XMM3, \XMM3
        vpxor   \T1, \XMM4, \XMM4
        vpxor   \T1, \XMM5, \XMM5
        vpxor   \T1, \XMM6, \XMM6
        vpxor   \T1, \XMM7, \XMM7
        vpxor   \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8


        #######################################################################

        vmovdqa HashKey_8(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4    # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7    # T7 = a0*b0
        vpclmulqdq      $0x01, \T5, \T2, \T6    # T6 = a1*b0
        vpclmulqdq      $0x10, \T5, \T2, \T5    # T5 = a0*b1
        vpxor   \T5, \T6, \T6
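
        #######################################################################
        # Note: each 128x128-bit carry-less multiply above is built from four
        # 64x64-bit VPCLMULQDQ products: a1*b1 (high), a0*b0 (low) and the
        # two cross terms a1*b0 ^ a0*b1 (middle, added at bit offset 64).
        # A scalar C model of one 64x64 product (not built, illustrative):
        #
        #   static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
        #   {
        #           uint64_t h = 0, l = 0;
        #           for (int i = 0; i < 64; i++)
        #                   if ((b >> i) & 1) {
        #                           l ^= a << i;
        #                           if (i)
        #                                   h ^= a >> (64 - i);
        #                   }
        #           *hi = h;
        #           *lo = l;
        #   }
        #
        #   /* full product, all additions are XORs (no carries):           */
        #   /* result = (a1*b1 << 128) ^ ((a1*b0 ^ a0*b1) << 64) ^ a0*b0    */
        #######################################################################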

        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP2(%rsp), \T1
        vmovdqa HashKey_7(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa TMP3(%rsp), \T1
        vmovdqa HashKey_6(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP4(%rsp), \T1
        vmovdqa HashKey_5(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8


        vmovdqa TMP5(%rsp), \T1
        vmovdqa HashKey_4(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP6(%rsp), \T1
        vmovdqa HashKey_3(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8
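
        #######################################################################
        # Note: one pass of this macro folds eight ciphertext blocks into the
        # hash at once using the precomputed powers HashKey_8..HashKey, i.e.
        # acc = (acc^C1)*H^8 ^ C2*H^7 ^ ... ^ C8*H^1, which equals eight
        # sequential (acc^Ci)*H steps but exposes eight independent
        # multiplies that can be interleaved with the AES rounds. A rough C
        # model (not built; be128/gf128_mul as sketched near the top of
        # INITIAL_BLOCKS_AVX2, Hpow[k] holding H^(k+1)):
        #
        #   be128 ghash_8_blocks(be128 acc, const be128 c[8], const be128 Hpow[8])
        #   {
        #           be128 sum = { 0, 0 };
        #           for (int i = 0; i < 8; i++) {
        #                   be128 x = c[i];
        #                   if (i == 0) { x.hi ^= acc.hi; x.lo ^= acc.lo; }
        #                   be128 t = gf128_mul(x, Hpow[7 - i]);   /* H^(8-i) */
        #                   sum.hi ^= t.hi;
        #                   sum.lo ^= t.lo;
        #           }
        #           return sum;
        #   }
        #######################################################################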

        vmovdqa TMP7(%rsp), \T1
        vmovdqa HashKey_2(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6


        #######################################################################

        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqa TMP8(%rsp), \T1
        vmovdqa HashKey(arg1), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T1


        vmovdqu 16*10(arg1), \T5

        i = 0
        j = 1
        setreg
.rep 8
        vpxor   16*i(arg3, %r11), \T5, \T2
        .if \ENC_DEC == ENC
        vaesenclast     \T2, reg_j, reg_j
        .else
        vaesenclast     \T2, reg_j, \T3
        vmovdqu 16*i(arg3, %r11), reg_j
        vmovdqu \T3, 16*i(arg2, %r11)
        .endif
        i = (i+1)
        j = (j+1)
        setreg
.endr
        #######################################################################


        vpslldq $8, \T6, \T3                    # shift-L T3 2 DWs
        vpsrldq $8, \T6, \T6                    # shift-R T6 2 DWs
        vpxor   \T3, \T7, \T7
        vpxor   \T6, \T1, \T1                   # accumulate the results in T1:T7


        #######################################################################
        # first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################
        .if \ENC_DEC == ENC
        vmovdqu \XMM1, 16*0(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg2,%r11)          # Write to the Ciphertext buffer
        .endif

        #######################################################################
        # second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor   \T4, \T1, \T1                   # the result is in T1
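
        #######################################################################
        # Note: the two pclmul phases above reduce the 256-bit carry-less
        # product modulo g(x) = x^128 + x^127 + x^126 + x^121 + 1 without a
        # variable-distance shift, using the POLY2 constant and the identity
        # x^128 == x^127 + x^126 + x^121 + 1 (mod g). A plain polynomial-view
        # C model of the same reduction (not built, illustrative only; the
        # real code operates on the bit-reflected representation):
        #
        #   typedef struct { uint64_t w[4]; } u256;     /* w[0] = lowest bits */
        #
        #   static int get_bit(const u256 *p, int i)
        #   {
        #           return (p->w[i >> 6] >> (i & 63)) & 1;
        #   }
        #
        #   static void reduce_mod_g(u256 *p)
        #   {
        #           static const int gbits[] = { 128, 127, 126, 121, 0 };
        #           for (int i = 255; i >= 128; i--)
        #                   if (get_bit(p, i))
        #                           for (int k = 0; k < 5; k++) {
        #                                   int b = gbits[k] + (i - 128);
        #                                   p->w[b >> 6] ^= 1ULL << (b & 63);
        #                           }
        #   }
        #######################################################################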

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap


        vpxor   \T1, \XMM1, \XMM1


.endm


# GHASH the last 8 ciphertext blocks.
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method

        vmovdqa HashKey_8(arg1), \T5

        vpshufd $0b01001110, \XMM1, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM1, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################

        vmovdqa HashKey_7(arg1), \T5
        vpshufd $0b01001110, \XMM2, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM2, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_6(arg1), \T5
        vpshufd $0b01001110, \XMM3, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM3, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_5(arg1), \T5
        vpshufd $0b01001110, \XMM4, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM4, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_4(arg1), \T5
        vpshufd $0b01001110, \XMM5, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM5, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_3(arg1), \T5
        vpshufd $0b01001110, \XMM6, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM6, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_2(arg1), \T5
        vpshufd $0b01001110, \XMM7, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM7, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey(arg1), \T5
        vpshufd $0b01001110, \XMM8, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM8, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1
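
        ######################
        # Note: each block above uses the Karatsuba trick: with a = a1:a0 and
        # b = b1:b0, the middle term a1*b0 ^ a0*b1 equals
        # (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0, so only three 64x64 multiplies are
        # needed per block (the VPSHUFD/VPXOR pairs form a1^a0 and b1^b0).
        # A rough C model (not built; clmul64 as sketched in
        # GHASH_8_ENCRYPT_8_PARALLEL_AVX2):
        #
        #   /* product = (hi << 128) ^ (mid << 64) ^ lo                    */
        #   uint64_t hh, hl, lh, ll, mh, ml;
        #   clmul64(a1, b1, &hh, &hl);               /* a1*b1             */
        #   clmul64(a0, b0, &lh, &ll);               /* a0*b0             */
        #   clmul64(a1 ^ a0, b1 ^ b0, &mh, &ml);     /* (a1^a0)*(b1^b0)   */
        #   mh ^= hh ^ lh;                           /* middle term, high */
        #   ml ^= hl ^ ll;                           /* middle term, low  */
        ######################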

        vpxor   \T6, \XMM1, \XMM1
        vpxor   \T7, \XMM1, \T2


        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                                # accumulated carry-less multiplications

        #######################################################################
        # first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################


        # second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor   \T4, \T6, \T6                   # the result is in T6
.endm


# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r14, r15
.macro GCM_ENC_DEC_AVX2 ENC_DEC

        #the number of pushes must equal STACK_OFFSET
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, %r14


        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp                      # align rsp to 64 bytes


        vmovdqu HashKey(arg1), %xmm13           # xmm13 = HashKey

        mov     arg4, %r13                      # save the number of bytes of plaintext/ciphertext
        and     $-16, %r13                      # r13 = r13 - (r13 mod 16)

        mov     %r13, %r12
        shr     $4, %r12
        and     $7, %r12
        jz      _initial_num_blocks_is_0\@

        cmp     $7, %r12
        je      _initial_num_blocks_is_7\@
        cmp     $6, %r12
        je      _initial_num_blocks_is_6\@
        cmp     $5, %r12
        je      _initial_num_blocks_is_5\@
        cmp     $4, %r12
        je      _initial_num_blocks_is_4\@
        cmp     $3, %r12
        je      _initial_num_blocks_is_3\@
        cmp     $2, %r12
        je      _initial_num_blocks_is_2\@

        jmp     _initial_num_blocks_is_1\@
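
        #######################################################################
        # Note: the dispatch above peels off (total_blocks mod 8) blocks via
        # INITIAL_BLOCKS_AVX2 so that the main loop always works on full
        # 8-block (128-byte) chunks. A rough C model of the bookkeeping (not
        # built, illustrative only):
        #
        #   uint64_t full_bytes = plaintext_len & ~15ULL;      /* whole blocks    */
        #   unsigned initial    = (full_bytes >> 4) & 7;       /* 0..7 blocks     */
        #   uint64_t remaining  = full_bytes - 16 * initial;   /* multiple of 128 */
        #######################################################################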

_initial_num_blocks_is_7\@:
        INITIAL_BLOCKS_AVX2     7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*7, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_6\@:
        INITIAL_BLOCKS_AVX2     6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*6, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_5\@:
        INITIAL_BLOCKS_AVX2     5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*5, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_4\@:
        INITIAL_BLOCKS_AVX2     4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*4, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_3\@:
        INITIAL_BLOCKS_AVX2     3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*3, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_2\@:
        INITIAL_BLOCKS_AVX2     2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*2, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_1\@:
        INITIAL_BLOCKS_AVX2     1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*1, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_0\@:
        INITIAL_BLOCKS_AVX2     0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC


_initial_blocks_encrypted\@:
        cmp     $0, %r13
        je      _zero_cipher_left\@

        sub     $128, %r13
        je      _eight_cipher_left\@


        vmovd   %xmm9, %r15d
        and     $255, %r15d
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9


_encrypt_by_8_new\@:
        cmp     $(255-8), %r15d
        jg      _encrypt_by_8\@


        add     $8, %r15b
        GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        jmp     _eight_cipher_left\@

_encrypt_by_8\@:
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $8, %r15b
        GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9


_eight_cipher_left\@:
        GHASH_LAST_8_AVX2       %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8


_zero_cipher_left\@:
        cmp     $16, arg4
        jl      _only_less_than_16\@

        mov     arg4, %r13
        and     $15, %r13                       # r13 = (arg4 mod 16)

        je      _multiple_of_16_bytes\@

        # handle the last <16 Byte block separately


        vpaddd  ONE(%rip), %xmm9, %xmm9         # INCR CNT to get Yn
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        ENCRYPT_SINGLE_BLOCK    %xmm9           # E(K, Yn)

        sub     $16, %r11
        add     %r13, %r11
        vmovdqu (arg3, %r11), %xmm1             # receive the last <16 Byte block

        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                      # adjust the shuffle mask pointer to be
                                                # able to shift 16-r13 bytes (r13 is the
                                                # number of bytes in plaintext mod 16)
        vmovdqu (%r12), %xmm2                   # get the appropriate shuffle mask
        vpshufb %xmm2, %xmm1, %xmm1             # shift right 16-r13 bytes
        jmp     _final_ghash_mul\@

_only_less_than_16\@:
        # check for 0 length
        mov     arg4, %r13
        and     $15, %r13                       # r13 = (arg4 mod 16)

        je      _multiple_of_16_bytes\@

        # handle the last <16 Byte block separately


        vpaddd  ONE(%rip), %xmm9, %xmm9         # INCR CNT to get Yn
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        ENCRYPT_SINGLE_BLOCK    %xmm9           # E(K, Yn)


        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                      # adjust the shuffle mask pointer to be
                                                # able to shift 16-r13 bytes (r13 is the
                                                # number of bytes in plaintext mod 16)

_get_last_16_byte_loop\@:
        movb    (arg3, %r11), %al
        movb    %al, TMP1 (%rsp , %r11)
        add     $1, %r11
        cmp     %r13, %r11
        jne     _get_last_16_byte_loop\@

        vmovdqu TMP1(%rsp), %xmm1

        sub     $16, %r11
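
        #######################################################################
        # Note: both paths above end up with the final r = len mod 16 bytes
        # left-justified in xmm1 and one more counter block encrypted in
        # xmm9. The leftover bytes are XORed with the keystream, the unused
        # tail of the block is masked off, and the masked ciphertext is fed
        # to GHASH. A rough C model (not built, illustrative only):
        #
        #   /* uint8_t last[16] = { 0 };                                     */
        #   /* memcpy(last, in + full_len, r);         r = len % 16, r > 0   */
        #   /* for (int i = 0; i < r; i++)                                   */
        #   /*         last[i] ^= keystream[i];        E(K, Yn), first r bytes */
        #   /* bytes r..15 stay zero so GHASH sees the zero-padded ciphertext */
        #######################################################################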

_final_ghash_mul\@:
        .if \ENC_DEC == DEC
        vmovdqa %xmm1, %xmm2
        vpxor   %xmm1, %xmm9, %xmm9             # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1   # get the appropriate mask to mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9             # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm2, %xmm2
        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm14, %xmm14
        # GHASH computation for the last <16 Byte block
        GHASH_MUL_AVX2  %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        sub     %r13, %r11
        add     $16, %r11
        .else
        vpxor   %xmm1, %xmm9, %xmm9             # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1   # get the appropriate mask to mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9             # mask out top 16-r13 bytes of xmm9
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        vpxor   %xmm9, %xmm14, %xmm14
        # GHASH computation for the last <16 Byte block
        GHASH_MUL_AVX2  %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        sub     %r13, %r11
        add     $16, %r11
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9   # shuffle xmm9 back to output as ciphertext
        .endif


        #############################
        # output r13 Bytes
        vmovq   %xmm9, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left\@

        mov     %rax, (arg2 , %r11)
        add     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        vmovq   %xmm9, %rax
        sub     $8, %r13

_less_than_8_bytes_left\@:
        movb    %al, (arg2 , %r11)
        add     $1, %r11
        shr     $8, %rax
        sub     $1, %r13
        jne     _less_than_8_bytes_left\@
        #############################

_multiple_of_16_bytes\@:
        mov     arg7, %r12                      # r12 = aadLen (number of bytes)
        shl     $3, %r12                        # convert into number of bits
        vmovd   %r12d, %xmm15                   # len(A) in xmm15

        shl     $3, arg4                        # len(C) in bits (*8)
        vmovq   arg4, %xmm1
        vpslldq $8, %xmm15, %xmm15              # xmm15 = len(A)|| 0x0000000000000000
        vpxor   %xmm1, %xmm15, %xmm15           # xmm15 = len(A)||len(C)

        vpxor   %xmm15, %xmm14, %xmm14
        GHASH_MUL_AVX2  %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap

        mov     arg5, %rax                      # rax = *Y0
        vmovdqu (%rax), %xmm9                   # xmm9 = Y0

        ENCRYPT_SINGLE_BLOCK    %xmm9           # E(K, Y0)

        vpxor   %xmm14, %xmm9, %xmm9


_return_T\@:
        mov     arg8, %r10                      # r10 = authTag
        mov     arg9, %r11                      # r11 = auth_tag_len

        cmp     $16, %r11
        je      _T_16\@

        cmp     $12, %r11
        je      _T_12\@

_T_8\@:
        vmovq   %xmm9, %rax
        mov     %rax, (%r10)
        jmp     _return_T_done\@
_T_12\@:
        vmovq   %xmm9, %rax
        mov     %rax, (%r10)
        vpsrldq $8, %xmm9, %xmm9
        vmovd   %xmm9, %eax
        mov     %eax, 8(%r10)
        jmp     _return_T_done\@

_T_16\@:
        vmovdqu %xmm9, (%r10)

_return_T_done\@:
        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
.endm
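
        #######################################################################
        # Note: the tag is T = E(K, Y0) XOR GHASH_H(A, C, len(A)||len(C)),
        # with both lengths expressed in bits, and only the first
        # auth_tag_len (16, 12 or 8) bytes are written out. A rough C model
        # (not built; be128/gf128_mul as sketched in INITIAL_BLOCKS_AVX2,
        # xor_into is an illustrative helper):
        #
        #   /* be128 lenblk;                                                 */
        #   /* lenblk.hi = aad_len * 8;                len(A) in bits        */
        #   /* lenblk.lo = plaintext_len * 8;          len(C) in bits        */
        #   /* acc.hi ^= lenblk.hi; acc.lo ^= lenblk.lo;                     */
        #   /* acc = gf128_mul(acc, H);                final GHASH step      */
        #   /* xor_into(&acc, ek_y0);                  E(K, Y0)              */
        #   /* memcpy(auth_tag, &acc, auth_tag_len);   truncate to 8/12/16   */
        #######################################################################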

#############################################################
#void   aesni_gcm_precomp_avx_gen4
#        (gcm_data     *my_ctx_data,
#         u8     *hash_subkey)# /* H, the Hash sub key input.
#                                  Data starts on a 16-byte boundary. */
#############################################################
ENTRY(aesni_gcm_precomp_avx_gen4)
        #the number of pushes must equal STACK_OFFSET
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, %r14


        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp                      # align rsp to 64 bytes

        vmovdqu (arg2), %xmm6                   # xmm6 = HashKey

        vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa %xmm6, %xmm2
        vpsllq  $1, %xmm6, %xmm6
        vpsrlq  $63, %xmm2, %xmm2
        vmovdqa %xmm2, %xmm1
        vpslldq $8, %xmm2, %xmm2
        vpsrldq $8, %xmm1, %xmm1
        vpor    %xmm2, %xmm6, %xmm6
        #reduction
        vpshufd $0b00100100, %xmm1, %xmm2
        vpcmpeqd        TWOONE(%rip), %xmm2, %xmm2
        vpand   POLY(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm6, %xmm6             # xmm6 holds the HashKey<<1 mod poly
        #######################################################################
        vmovdqa %xmm6, HashKey(arg1)            # store HashKey<<1 mod poly


        PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        ret
ENDPROC(aesni_gcm_precomp_avx_gen4)


###############################################################################
#void   aesni_gcm_enc_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                        (from Security Association) concatenated with 8 byte
#                        Initialisation Vector (from IPSec ESP Payload)
#                        concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_enc_avx_gen4)
        GCM_ENC_DEC_AVX2        ENC
        ret
ENDPROC(aesni_gcm_enc_avx_gen4)

###############################################################################
#void   aesni_gcm_dec_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                        (from Security Association) concatenated with 8 byte
#                        Initialisation Vector (from IPSec ESP Payload)
#                        concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_dec_avx_gen4)
        GCM_ENC_DEC_AVX2        DEC
        ret
ENDPROC(aesni_gcm_dec_avx_gen4)

#endif /* CONFIG_AS_AVX2 */
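
#############################################################
# Usage sketch (not built): the C glue code is expected to call
# aesni_gcm_precomp_avx_gen4() once per key to derive and cache the hash
# subkey powers, then aesni_gcm_enc_avx_gen4() / aesni_gcm_dec_avx_gen4()
# per request with the argument layout documented above. Hypothetical
# stand-alone declarations for illustration only (the real ones live in the
# kernel's C glue code, which also defines gcm_data):
#
#   /* typedef unsigned char u8;                                             */
#   /* typedef unsigned long long u64;                                       */
#   /*                                                                       */
#   /* void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);  */
#   /* void aesni_gcm_enc_avx_gen4(void *my_ctx_data, u8 *out, const u8 *in, */
#   /*                             u64 plaintext_len, u8 *iv, const u8 *aad, */
#   /*                             u64 aad_len, u8 *auth_tag,                */
#   /*                             u64 auth_tag_len);                        */
#   /*                                                                       */
#   /* per key:    aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);             */
#   /* per packet: aesni_gcm_enc_avx_gen4(ctx, dst, src, len, iv,            */
#   /*                                    aad, 12, tag, 16);                 */
#############################################################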