1######################################################################## 2# Copyright (c) 2013, Intel Corporation 3# 4# This software is available to you under a choice of one of two 5# licenses. You may choose to be licensed under the terms of the GNU 6# General Public License (GPL) Version 2, available from the file 7# COPYING in the main directory of this source tree, or the 8# OpenIB.org BSD license below: 9# 10# Redistribution and use in source and binary forms, with or without 11# modification, are permitted provided that the following conditions are 12# met: 13# 14# * Redistributions of source code must retain the above copyright 15# notice, this list of conditions and the following disclaimer. 16# 17# * Redistributions in binary form must reproduce the above copyright 18# notice, this list of conditions and the following disclaimer in the 19# documentation and/or other materials provided with the 20# distribution. 21# 22# * Neither the name of the Intel Corporation nor the names of its 23# contributors may be used to endorse or promote products derived from 24# this software without specific prior written permission. 25# 26# 27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY 28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR 31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR 34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38######################################################################## 39## 40## Authors: 41## Erdinc Ozturk <erdinc.ozturk@intel.com> 42## Vinodh Gopal <vinodh.gopal@intel.com> 43## James Guilford <james.guilford@intel.com> 44## Tim Chen <tim.c.chen@linux.intel.com> 45## 46## References: 47## This code was derived and highly optimized from the code described in paper: 48## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation 49## on Intel Architecture Processors. August, 2010 50## The details of the implementation is explained in: 51## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode 52## on Intel Architecture Processors. October, 2012. 53## 54## Assumptions: 55## 56## 57## 58## iv: 59## 0 1 2 3 60## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 61## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 62## | Salt (From the SA) | 63## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 64## | Initialization Vector | 65## | (This is the sequence number from IPSec header) | 66## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 67## | 0x1 | 68## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 69## 70## 71## 72## AAD: 73## AAD padded to 128 bits with 0 74## for example, assume AAD is a u32 vector 75## 76## if AAD is 8 bytes: 77## AAD[3] = {A0, A1}# 78## padded AAD in xmm register = {A1 A0 0 0} 79## 80## 0 1 2 3 81## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 82## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 83## | SPI (A1) | 84## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 85## | 32-bit Sequence Number (A0) | 86## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 87## | 0x0 | 88## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 89## 90## AAD Format with 32-bit Sequence Number 91## 92## if AAD is 12 bytes: 93## AAD[3] = {A0, A1, A2}# 94## padded AAD in xmm register = {A2 A1 A0 0} 95## 96## 0 1 2 3 97## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 98## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 99## | SPI (A2) | 100## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 101## | 64-bit Extended Sequence Number {A1,A0} | 102## | | 103## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 104## | 0x0 | 105## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 106## 107## AAD Format with 64-bit Extended Sequence Number 108## 109## 110## aadLen: 111## from the definition of the spec, aadLen can only be 8 or 12 bytes. 112## The code additionally supports aadLen of length 16 bytes. 113## 114## TLen: 115## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 116## 117## poly = x^128 + x^127 + x^126 + x^121 + 1 118## throughout the code, one tab and two tab indentations are used. one tab is 119## for GHASH part, two tabs is for AES part. 120## 121 122#include <linux/linkage.h> 123#include <asm/inst.h> 124 125# constants in mergeable sections, linker can reorder and merge 126.section .rodata.cst16.POLY, "aM", @progbits, 16 127.align 16 128POLY: .octa 0xC2000000000000000000000000000001 129 130.section .rodata.cst16.POLY2, "aM", @progbits, 16 131.align 16 132POLY2: .octa 0xC20000000000000000000001C2000000 133 134.section .rodata.cst16.TWOONE, "aM", @progbits, 16 135.align 16 136TWOONE: .octa 0x00000001000000000000000000000001 137 138.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 139.align 16 140SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F 141 142.section .rodata.cst16.ONE, "aM", @progbits, 16 143.align 16 144ONE: .octa 0x00000000000000000000000000000001 145 146.section .rodata.cst16.ONEf, "aM", @progbits, 16 147.align 16 148ONEf: .octa 0x01000000000000000000000000000000 149 150# order of these constants should not change. 151# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F 152.section .rodata, "a", @progbits 153.align 16 154SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 155ALL_F: .octa 0xffffffffffffffffffffffffffffffff 156 .octa 0x00000000000000000000000000000000 157 158.text 159 160 161##define the fields of the gcm aes context 162#{ 163# u8 expanded_keys[16*11] store expanded keys 164# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here 165# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here 166# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here 167# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here 168# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here 169# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here 170# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here 171# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here 172# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes) 173# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes) 174# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes) 175# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes) 176# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes) 177# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes) 178# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes) 179# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes) 180#} gcm_ctx# 181 182HashKey = 16*11 # store HashKey <<1 mod poly here 183HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here 184HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here 185HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here 186HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here 187HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here 188HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here 189HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here 190HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) 191HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) 192HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) 193HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) 194HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) 195HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) 196HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) 197HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) 198 199#define arg1 %rdi 200#define arg2 %rsi 201#define arg3 %rdx 202#define arg4 %rcx 203#define arg5 %r8 204#define arg6 %r9 205#define arg7 STACK_OFFSET+8*1(%r14) 206#define arg8 STACK_OFFSET+8*2(%r14) 207#define arg9 STACK_OFFSET+8*3(%r14) 208 209i = 0 210j = 0 211 212out_order = 0 213in_order = 1 214DEC = 0 215ENC = 1 216 217.macro define_reg r n 218reg_\r = %xmm\n 219.endm 220 221.macro setreg 222.altmacro 223define_reg i %i 224define_reg j %j 225.noaltmacro 226.endm 227 228# need to push 4 registers into stack to maintain 229STACK_OFFSET = 8*4 230 231TMP1 = 16*0 # Temporary storage for AAD 232TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) 233TMP3 = 16*2 # Temporary storage for AES State 3 234TMP4 = 16*3 # Temporary storage for AES State 4 235TMP5 = 16*4 # Temporary storage for AES State 5 236TMP6 = 16*5 # Temporary storage for AES State 6 237TMP7 = 16*6 # Temporary storage for AES State 7 238TMP8 = 16*7 # Temporary storage for AES State 8 239 240VARIABLE_OFFSET = 16*8 241 242################################ 243# Utility Macros 244################################ 245 246# Encryption of a single block 247.macro ENCRYPT_SINGLE_BLOCK XMM0 248 vpxor (arg1), \XMM0, \XMM0 249 i = 1 250 setreg 251.rep 9 252 vaesenc 16*i(arg1), \XMM0, \XMM0 253 i = (i+1) 254 setreg 255.endr 256 vaesenclast 16*10(arg1), \XMM0, \XMM0 257.endm 258 259#ifdef CONFIG_AS_AVX 260############################################################################### 261# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 262# Input: A and B (128-bits each, bit-reflected) 263# Output: C = A*B*x mod poly, (i.e. >>1 ) 264# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 265# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 266############################################################################### 267.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 268 269 vpshufd $0b01001110, \GH, \T2 270 vpshufd $0b01001110, \HK, \T3 271 vpxor \GH , \T2, \T2 # T2 = (a1+a0) 272 vpxor \HK , \T3, \T3 # T3 = (b1+b0) 273 274 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 275 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 276 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) 277 vpxor \GH, \T2,\T2 278 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 279 280 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs 281 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs 282 vpxor \T3, \GH, \GH 283 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK 284 285 #first phase of the reduction 286 vpslld $31, \GH, \T2 # packed right shifting << 31 287 vpslld $30, \GH, \T3 # packed right shifting shift << 30 288 vpslld $25, \GH, \T4 # packed right shifting shift << 25 289 290 vpxor \T3, \T2, \T2 # xor the shifted versions 291 vpxor \T4, \T2, \T2 292 293 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW 294 295 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 296 vpxor \T2, \GH, \GH # first phase of the reduction complete 297 298 #second phase of the reduction 299 300 vpsrld $1,\GH, \T2 # packed left shifting >> 1 301 vpsrld $2,\GH, \T3 # packed left shifting >> 2 302 vpsrld $7,\GH, \T4 # packed left shifting >> 7 303 vpxor \T3, \T2, \T2 # xor the shifted versions 304 vpxor \T4, \T2, \T2 305 306 vpxor \T5, \T2, \T2 307 vpxor \T2, \GH, \GH 308 vpxor \T1, \GH, \GH # the result is in GH 309 310 311.endm 312 313.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 314 315 # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 316 vmovdqa \HK, \T5 317 318 vpshufd $0b01001110, \T5, \T1 319 vpxor \T5, \T1, \T1 320 vmovdqa \T1, HashKey_k(arg1) 321 322 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly 323 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly 324 vpshufd $0b01001110, \T5, \T1 325 vpxor \T5, \T1, \T1 326 vmovdqa \T1, HashKey_2_k(arg1) 327 328 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly 329 vmovdqa \T5, HashKey_3(arg1) 330 vpshufd $0b01001110, \T5, \T1 331 vpxor \T5, \T1, \T1 332 vmovdqa \T1, HashKey_3_k(arg1) 333 334 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly 335 vmovdqa \T5, HashKey_4(arg1) 336 vpshufd $0b01001110, \T5, \T1 337 vpxor \T5, \T1, \T1 338 vmovdqa \T1, HashKey_4_k(arg1) 339 340 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly 341 vmovdqa \T5, HashKey_5(arg1) 342 vpshufd $0b01001110, \T5, \T1 343 vpxor \T5, \T1, \T1 344 vmovdqa \T1, HashKey_5_k(arg1) 345 346 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly 347 vmovdqa \T5, HashKey_6(arg1) 348 vpshufd $0b01001110, \T5, \T1 349 vpxor \T5, \T1, \T1 350 vmovdqa \T1, HashKey_6_k(arg1) 351 352 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly 353 vmovdqa \T5, HashKey_7(arg1) 354 vpshufd $0b01001110, \T5, \T1 355 vpxor \T5, \T1, \T1 356 vmovdqa \T1, HashKey_7_k(arg1) 357 358 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly 359 vmovdqa \T5, HashKey_8(arg1) 360 vpshufd $0b01001110, \T5, \T1 361 vpxor \T5, \T1, \T1 362 vmovdqa \T1, HashKey_8_k(arg1) 363 364.endm 365 366## if a = number of total plaintext bytes 367## b = floor(a/16) 368## num_initial_blocks = b mod 4# 369## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext 370## r10, r11, r12, rax are clobbered 371## arg1, arg2, arg3, r14 are used as a pointer only, not modified 372 373.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC 374 i = (8-\num_initial_blocks) 375 setreg 376 377 mov arg6, %r10 # r10 = AAD 378 mov arg7, %r12 # r12 = aadLen 379 380 381 mov %r12, %r11 382 383 vpxor reg_i, reg_i, reg_i 384_get_AAD_loop\@: 385 vmovd (%r10), \T1 386 vpslldq $12, \T1, \T1 387 vpsrldq $4, reg_i, reg_i 388 vpxor \T1, reg_i, reg_i 389 390 add $4, %r10 391 sub $4, %r12 392 jg _get_AAD_loop\@ 393 394 395 cmp $16, %r11 396 je _get_AAD_loop2_done\@ 397 mov $16, %r12 398 399_get_AAD_loop2\@: 400 vpsrldq $4, reg_i, reg_i 401 sub $4, %r12 402 cmp %r11, %r12 403 jg _get_AAD_loop2\@ 404 405_get_AAD_loop2_done\@: 406 407 #byte-reflect the AAD data 408 vpshufb SHUF_MASK(%rip), reg_i, reg_i 409 410 # initialize the data pointer offset as zero 411 xor %r11, %r11 412 413 # start AES for num_initial_blocks blocks 414 mov arg5, %rax # rax = *Y0 415 vmovdqu (%rax), \CTR # CTR = Y0 416 vpshufb SHUF_MASK(%rip), \CTR, \CTR 417 418 419 i = (9-\num_initial_blocks) 420 setreg 421.rep \num_initial_blocks 422 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 423 vmovdqa \CTR, reg_i 424 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap 425 i = (i+1) 426 setreg 427.endr 428 429 vmovdqa (arg1), \T_key 430 i = (9-\num_initial_blocks) 431 setreg 432.rep \num_initial_blocks 433 vpxor \T_key, reg_i, reg_i 434 i = (i+1) 435 setreg 436.endr 437 438 j = 1 439 setreg 440.rep 9 441 vmovdqa 16*j(arg1), \T_key 442 i = (9-\num_initial_blocks) 443 setreg 444.rep \num_initial_blocks 445 vaesenc \T_key, reg_i, reg_i 446 i = (i+1) 447 setreg 448.endr 449 450 j = (j+1) 451 setreg 452.endr 453 454 455 vmovdqa 16*10(arg1), \T_key 456 i = (9-\num_initial_blocks) 457 setreg 458.rep \num_initial_blocks 459 vaesenclast \T_key, reg_i, reg_i 460 i = (i+1) 461 setreg 462.endr 463 464 i = (9-\num_initial_blocks) 465 setreg 466.rep \num_initial_blocks 467 vmovdqu (arg3, %r11), \T1 468 vpxor \T1, reg_i, reg_i 469 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks 470 add $16, %r11 471.if \ENC_DEC == DEC 472 vmovdqa \T1, reg_i 473.endif 474 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 475 i = (i+1) 476 setreg 477.endr 478 479 480 i = (8-\num_initial_blocks) 481 j = (9-\num_initial_blocks) 482 setreg 483 GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 484 485.rep \num_initial_blocks 486 vpxor reg_i, reg_j, reg_j 487 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks 488 i = (i+1) 489 j = (j+1) 490 setreg 491.endr 492 # XMM8 has the combined result here 493 494 vmovdqa \XMM8, TMP1(%rsp) 495 vmovdqa \XMM8, \T3 496 497 cmp $128, %r13 498 jl _initial_blocks_done\@ # no need for precomputed constants 499 500############################################################################### 501# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 502 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 503 vmovdqa \CTR, \XMM1 504 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 505 506 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 507 vmovdqa \CTR, \XMM2 508 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 509 510 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 511 vmovdqa \CTR, \XMM3 512 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 513 514 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 515 vmovdqa \CTR, \XMM4 516 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 517 518 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 519 vmovdqa \CTR, \XMM5 520 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 521 522 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 523 vmovdqa \CTR, \XMM6 524 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 525 526 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 527 vmovdqa \CTR, \XMM7 528 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 529 530 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 531 vmovdqa \CTR, \XMM8 532 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 533 534 vmovdqa (arg1), \T_key 535 vpxor \T_key, \XMM1, \XMM1 536 vpxor \T_key, \XMM2, \XMM2 537 vpxor \T_key, \XMM3, \XMM3 538 vpxor \T_key, \XMM4, \XMM4 539 vpxor \T_key, \XMM5, \XMM5 540 vpxor \T_key, \XMM6, \XMM6 541 vpxor \T_key, \XMM7, \XMM7 542 vpxor \T_key, \XMM8, \XMM8 543 544 i = 1 545 setreg 546.rep 9 # do 9 rounds 547 vmovdqa 16*i(arg1), \T_key 548 vaesenc \T_key, \XMM1, \XMM1 549 vaesenc \T_key, \XMM2, \XMM2 550 vaesenc \T_key, \XMM3, \XMM3 551 vaesenc \T_key, \XMM4, \XMM4 552 vaesenc \T_key, \XMM5, \XMM5 553 vaesenc \T_key, \XMM6, \XMM6 554 vaesenc \T_key, \XMM7, \XMM7 555 vaesenc \T_key, \XMM8, \XMM8 556 i = (i+1) 557 setreg 558.endr 559 560 561 vmovdqa 16*i(arg1), \T_key 562 vaesenclast \T_key, \XMM1, \XMM1 563 vaesenclast \T_key, \XMM2, \XMM2 564 vaesenclast \T_key, \XMM3, \XMM3 565 vaesenclast \T_key, \XMM4, \XMM4 566 vaesenclast \T_key, \XMM5, \XMM5 567 vaesenclast \T_key, \XMM6, \XMM6 568 vaesenclast \T_key, \XMM7, \XMM7 569 vaesenclast \T_key, \XMM8, \XMM8 570 571 vmovdqu (arg3, %r11), \T1 572 vpxor \T1, \XMM1, \XMM1 573 vmovdqu \XMM1, (arg2 , %r11) 574 .if \ENC_DEC == DEC 575 vmovdqa \T1, \XMM1 576 .endif 577 578 vmovdqu 16*1(arg3, %r11), \T1 579 vpxor \T1, \XMM2, \XMM2 580 vmovdqu \XMM2, 16*1(arg2 , %r11) 581 .if \ENC_DEC == DEC 582 vmovdqa \T1, \XMM2 583 .endif 584 585 vmovdqu 16*2(arg3, %r11), \T1 586 vpxor \T1, \XMM3, \XMM3 587 vmovdqu \XMM3, 16*2(arg2 , %r11) 588 .if \ENC_DEC == DEC 589 vmovdqa \T1, \XMM3 590 .endif 591 592 vmovdqu 16*3(arg3, %r11), \T1 593 vpxor \T1, \XMM4, \XMM4 594 vmovdqu \XMM4, 16*3(arg2 , %r11) 595 .if \ENC_DEC == DEC 596 vmovdqa \T1, \XMM4 597 .endif 598 599 vmovdqu 16*4(arg3, %r11), \T1 600 vpxor \T1, \XMM5, \XMM5 601 vmovdqu \XMM5, 16*4(arg2 , %r11) 602 .if \ENC_DEC == DEC 603 vmovdqa \T1, \XMM5 604 .endif 605 606 vmovdqu 16*5(arg3, %r11), \T1 607 vpxor \T1, \XMM6, \XMM6 608 vmovdqu \XMM6, 16*5(arg2 , %r11) 609 .if \ENC_DEC == DEC 610 vmovdqa \T1, \XMM6 611 .endif 612 613 vmovdqu 16*6(arg3, %r11), \T1 614 vpxor \T1, \XMM7, \XMM7 615 vmovdqu \XMM7, 16*6(arg2 , %r11) 616 .if \ENC_DEC == DEC 617 vmovdqa \T1, \XMM7 618 .endif 619 620 vmovdqu 16*7(arg3, %r11), \T1 621 vpxor \T1, \XMM8, \XMM8 622 vmovdqu \XMM8, 16*7(arg2 , %r11) 623 .if \ENC_DEC == DEC 624 vmovdqa \T1, \XMM8 625 .endif 626 627 add $128, %r11 628 629 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 630 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext 631 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 632 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 633 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 634 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 635 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 636 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 637 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 638 639############################################################################### 640 641_initial_blocks_done\@: 642 643.endm 644 645# encrypt 8 blocks at a time 646# ghash the 8 previously encrypted ciphertext blocks 647# arg1, arg2, arg3 are used as pointers only, not modified 648# r11 is the data offset value 649.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC 650 651 vmovdqa \XMM1, \T2 652 vmovdqa \XMM2, TMP2(%rsp) 653 vmovdqa \XMM3, TMP3(%rsp) 654 vmovdqa \XMM4, TMP4(%rsp) 655 vmovdqa \XMM5, TMP5(%rsp) 656 vmovdqa \XMM6, TMP6(%rsp) 657 vmovdqa \XMM7, TMP7(%rsp) 658 vmovdqa \XMM8, TMP8(%rsp) 659 660.if \loop_idx == in_order 661 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT 662 vpaddd ONE(%rip), \XMM1, \XMM2 663 vpaddd ONE(%rip), \XMM2, \XMM3 664 vpaddd ONE(%rip), \XMM3, \XMM4 665 vpaddd ONE(%rip), \XMM4, \XMM5 666 vpaddd ONE(%rip), \XMM5, \XMM6 667 vpaddd ONE(%rip), \XMM6, \XMM7 668 vpaddd ONE(%rip), \XMM7, \XMM8 669 vmovdqa \XMM8, \CTR 670 671 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 672 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 673 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 674 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 675 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 676 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 677 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 678 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 679.else 680 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT 681 vpaddd ONEf(%rip), \XMM1, \XMM2 682 vpaddd ONEf(%rip), \XMM2, \XMM3 683 vpaddd ONEf(%rip), \XMM3, \XMM4 684 vpaddd ONEf(%rip), \XMM4, \XMM5 685 vpaddd ONEf(%rip), \XMM5, \XMM6 686 vpaddd ONEf(%rip), \XMM6, \XMM7 687 vpaddd ONEf(%rip), \XMM7, \XMM8 688 vmovdqa \XMM8, \CTR 689.endif 690 691 692 ####################################################################### 693 694 vmovdqu (arg1), \T1 695 vpxor \T1, \XMM1, \XMM1 696 vpxor \T1, \XMM2, \XMM2 697 vpxor \T1, \XMM3, \XMM3 698 vpxor \T1, \XMM4, \XMM4 699 vpxor \T1, \XMM5, \XMM5 700 vpxor \T1, \XMM6, \XMM6 701 vpxor \T1, \XMM7, \XMM7 702 vpxor \T1, \XMM8, \XMM8 703 704 ####################################################################### 705 706 707 708 709 710 vmovdqu 16*1(arg1), \T1 711 vaesenc \T1, \XMM1, \XMM1 712 vaesenc \T1, \XMM2, \XMM2 713 vaesenc \T1, \XMM3, \XMM3 714 vaesenc \T1, \XMM4, \XMM4 715 vaesenc \T1, \XMM5, \XMM5 716 vaesenc \T1, \XMM6, \XMM6 717 vaesenc \T1, \XMM7, \XMM7 718 vaesenc \T1, \XMM8, \XMM8 719 720 vmovdqu 16*2(arg1), \T1 721 vaesenc \T1, \XMM1, \XMM1 722 vaesenc \T1, \XMM2, \XMM2 723 vaesenc \T1, \XMM3, \XMM3 724 vaesenc \T1, \XMM4, \XMM4 725 vaesenc \T1, \XMM5, \XMM5 726 vaesenc \T1, \XMM6, \XMM6 727 vaesenc \T1, \XMM7, \XMM7 728 vaesenc \T1, \XMM8, \XMM8 729 730 731 ####################################################################### 732 733 vmovdqa HashKey_8(arg1), \T5 734 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 735 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 736 737 vpshufd $0b01001110, \T2, \T6 738 vpxor \T2, \T6, \T6 739 740 vmovdqa HashKey_8_k(arg1), \T5 741 vpclmulqdq $0x00, \T5, \T6, \T6 742 743 vmovdqu 16*3(arg1), \T1 744 vaesenc \T1, \XMM1, \XMM1 745 vaesenc \T1, \XMM2, \XMM2 746 vaesenc \T1, \XMM3, \XMM3 747 vaesenc \T1, \XMM4, \XMM4 748 vaesenc \T1, \XMM5, \XMM5 749 vaesenc \T1, \XMM6, \XMM6 750 vaesenc \T1, \XMM7, \XMM7 751 vaesenc \T1, \XMM8, \XMM8 752 753 vmovdqa TMP2(%rsp), \T1 754 vmovdqa HashKey_7(arg1), \T5 755 vpclmulqdq $0x11, \T5, \T1, \T3 756 vpxor \T3, \T4, \T4 757 vpclmulqdq $0x00, \T5, \T1, \T3 758 vpxor \T3, \T7, \T7 759 760 vpshufd $0b01001110, \T1, \T3 761 vpxor \T1, \T3, \T3 762 vmovdqa HashKey_7_k(arg1), \T5 763 vpclmulqdq $0x10, \T5, \T3, \T3 764 vpxor \T3, \T6, \T6 765 766 vmovdqu 16*4(arg1), \T1 767 vaesenc \T1, \XMM1, \XMM1 768 vaesenc \T1, \XMM2, \XMM2 769 vaesenc \T1, \XMM3, \XMM3 770 vaesenc \T1, \XMM4, \XMM4 771 vaesenc \T1, \XMM5, \XMM5 772 vaesenc \T1, \XMM6, \XMM6 773 vaesenc \T1, \XMM7, \XMM7 774 vaesenc \T1, \XMM8, \XMM8 775 776 ####################################################################### 777 778 vmovdqa TMP3(%rsp), \T1 779 vmovdqa HashKey_6(arg1), \T5 780 vpclmulqdq $0x11, \T5, \T1, \T3 781 vpxor \T3, \T4, \T4 782 vpclmulqdq $0x00, \T5, \T1, \T3 783 vpxor \T3, \T7, \T7 784 785 vpshufd $0b01001110, \T1, \T3 786 vpxor \T1, \T3, \T3 787 vmovdqa HashKey_6_k(arg1), \T5 788 vpclmulqdq $0x10, \T5, \T3, \T3 789 vpxor \T3, \T6, \T6 790 791 vmovdqu 16*5(arg1), \T1 792 vaesenc \T1, \XMM1, \XMM1 793 vaesenc \T1, \XMM2, \XMM2 794 vaesenc \T1, \XMM3, \XMM3 795 vaesenc \T1, \XMM4, \XMM4 796 vaesenc \T1, \XMM5, \XMM5 797 vaesenc \T1, \XMM6, \XMM6 798 vaesenc \T1, \XMM7, \XMM7 799 vaesenc \T1, \XMM8, \XMM8 800 801 vmovdqa TMP4(%rsp), \T1 802 vmovdqa HashKey_5(arg1), \T5 803 vpclmulqdq $0x11, \T5, \T1, \T3 804 vpxor \T3, \T4, \T4 805 vpclmulqdq $0x00, \T5, \T1, \T3 806 vpxor \T3, \T7, \T7 807 808 vpshufd $0b01001110, \T1, \T3 809 vpxor \T1, \T3, \T3 810 vmovdqa HashKey_5_k(arg1), \T5 811 vpclmulqdq $0x10, \T5, \T3, \T3 812 vpxor \T3, \T6, \T6 813 814 vmovdqu 16*6(arg1), \T1 815 vaesenc \T1, \XMM1, \XMM1 816 vaesenc \T1, \XMM2, \XMM2 817 vaesenc \T1, \XMM3, \XMM3 818 vaesenc \T1, \XMM4, \XMM4 819 vaesenc \T1, \XMM5, \XMM5 820 vaesenc \T1, \XMM6, \XMM6 821 vaesenc \T1, \XMM7, \XMM7 822 vaesenc \T1, \XMM8, \XMM8 823 824 825 vmovdqa TMP5(%rsp), \T1 826 vmovdqa HashKey_4(arg1), \T5 827 vpclmulqdq $0x11, \T5, \T1, \T3 828 vpxor \T3, \T4, \T4 829 vpclmulqdq $0x00, \T5, \T1, \T3 830 vpxor \T3, \T7, \T7 831 832 vpshufd $0b01001110, \T1, \T3 833 vpxor \T1, \T3, \T3 834 vmovdqa HashKey_4_k(arg1), \T5 835 vpclmulqdq $0x10, \T5, \T3, \T3 836 vpxor \T3, \T6, \T6 837 838 vmovdqu 16*7(arg1), \T1 839 vaesenc \T1, \XMM1, \XMM1 840 vaesenc \T1, \XMM2, \XMM2 841 vaesenc \T1, \XMM3, \XMM3 842 vaesenc \T1, \XMM4, \XMM4 843 vaesenc \T1, \XMM5, \XMM5 844 vaesenc \T1, \XMM6, \XMM6 845 vaesenc \T1, \XMM7, \XMM7 846 vaesenc \T1, \XMM8, \XMM8 847 848 vmovdqa TMP6(%rsp), \T1 849 vmovdqa HashKey_3(arg1), \T5 850 vpclmulqdq $0x11, \T5, \T1, \T3 851 vpxor \T3, \T4, \T4 852 vpclmulqdq $0x00, \T5, \T1, \T3 853 vpxor \T3, \T7, \T7 854 855 vpshufd $0b01001110, \T1, \T3 856 vpxor \T1, \T3, \T3 857 vmovdqa HashKey_3_k(arg1), \T5 858 vpclmulqdq $0x10, \T5, \T3, \T3 859 vpxor \T3, \T6, \T6 860 861 862 vmovdqu 16*8(arg1), \T1 863 vaesenc \T1, \XMM1, \XMM1 864 vaesenc \T1, \XMM2, \XMM2 865 vaesenc \T1, \XMM3, \XMM3 866 vaesenc \T1, \XMM4, \XMM4 867 vaesenc \T1, \XMM5, \XMM5 868 vaesenc \T1, \XMM6, \XMM6 869 vaesenc \T1, \XMM7, \XMM7 870 vaesenc \T1, \XMM8, \XMM8 871 872 vmovdqa TMP7(%rsp), \T1 873 vmovdqa HashKey_2(arg1), \T5 874 vpclmulqdq $0x11, \T5, \T1, \T3 875 vpxor \T3, \T4, \T4 876 vpclmulqdq $0x00, \T5, \T1, \T3 877 vpxor \T3, \T7, \T7 878 879 vpshufd $0b01001110, \T1, \T3 880 vpxor \T1, \T3, \T3 881 vmovdqa HashKey_2_k(arg1), \T5 882 vpclmulqdq $0x10, \T5, \T3, \T3 883 vpxor \T3, \T6, \T6 884 885 ####################################################################### 886 887 vmovdqu 16*9(arg1), \T5 888 vaesenc \T5, \XMM1, \XMM1 889 vaesenc \T5, \XMM2, \XMM2 890 vaesenc \T5, \XMM3, \XMM3 891 vaesenc \T5, \XMM4, \XMM4 892 vaesenc \T5, \XMM5, \XMM5 893 vaesenc \T5, \XMM6, \XMM6 894 vaesenc \T5, \XMM7, \XMM7 895 vaesenc \T5, \XMM8, \XMM8 896 897 vmovdqa TMP8(%rsp), \T1 898 vmovdqa HashKey(arg1), \T5 899 vpclmulqdq $0x11, \T5, \T1, \T3 900 vpxor \T3, \T4, \T4 901 vpclmulqdq $0x00, \T5, \T1, \T3 902 vpxor \T3, \T7, \T7 903 904 vpshufd $0b01001110, \T1, \T3 905 vpxor \T1, \T3, \T3 906 vmovdqa HashKey_k(arg1), \T5 907 vpclmulqdq $0x10, \T5, \T3, \T3 908 vpxor \T3, \T6, \T6 909 910 vpxor \T4, \T6, \T6 911 vpxor \T7, \T6, \T6 912 913 vmovdqu 16*10(arg1), \T5 914 915 i = 0 916 j = 1 917 setreg 918.rep 8 919 vpxor 16*i(arg3, %r11), \T5, \T2 920 .if \ENC_DEC == ENC 921 vaesenclast \T2, reg_j, reg_j 922 .else 923 vaesenclast \T2, reg_j, \T3 924 vmovdqu 16*i(arg3, %r11), reg_j 925 vmovdqu \T3, 16*i(arg2, %r11) 926 .endif 927 i = (i+1) 928 j = (j+1) 929 setreg 930.endr 931 ####################################################################### 932 933 934 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs 935 vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs 936 vpxor \T3, \T7, \T7 937 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 938 939 940 941 ####################################################################### 942 #first phase of the reduction 943 ####################################################################### 944 vpslld $31, \T7, \T2 # packed right shifting << 31 945 vpslld $30, \T7, \T3 # packed right shifting shift << 30 946 vpslld $25, \T7, \T4 # packed right shifting shift << 25 947 948 vpxor \T3, \T2, \T2 # xor the shifted versions 949 vpxor \T4, \T2, \T2 950 951 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 952 953 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 954 vpxor \T2, \T7, \T7 # first phase of the reduction complete 955 ####################################################################### 956 .if \ENC_DEC == ENC 957 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer 958 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer 959 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer 960 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer 961 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer 962 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer 963 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer 964 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer 965 .endif 966 967 ####################################################################### 968 #second phase of the reduction 969 vpsrld $1, \T7, \T2 # packed left shifting >> 1 970 vpsrld $2, \T7, \T3 # packed left shifting >> 2 971 vpsrld $7, \T7, \T4 # packed left shifting >> 7 972 vpxor \T3, \T2, \T2 # xor the shifted versions 973 vpxor \T4, \T2, \T2 974 975 vpxor \T1, \T2, \T2 976 vpxor \T2, \T7, \T7 977 vpxor \T7, \T6, \T6 # the result is in T6 978 ####################################################################### 979 980 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 981 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 982 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 983 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 984 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 985 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 986 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 987 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 988 989 990 vpxor \T6, \XMM1, \XMM1 991 992 993 994.endm 995 996 997# GHASH the last 4 ciphertext blocks. 998.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 999 1000 ## Karatsuba Method 1001 1002 1003 vpshufd $0b01001110, \XMM1, \T2 1004 vpxor \XMM1, \T2, \T2 1005 vmovdqa HashKey_8(arg1), \T5 1006 vpclmulqdq $0x11, \T5, \XMM1, \T6 1007 vpclmulqdq $0x00, \T5, \XMM1, \T7 1008 1009 vmovdqa HashKey_8_k(arg1), \T3 1010 vpclmulqdq $0x00, \T3, \T2, \XMM1 1011 1012 ###################### 1013 1014 vpshufd $0b01001110, \XMM2, \T2 1015 vpxor \XMM2, \T2, \T2 1016 vmovdqa HashKey_7(arg1), \T5 1017 vpclmulqdq $0x11, \T5, \XMM2, \T4 1018 vpxor \T4, \T6, \T6 1019 1020 vpclmulqdq $0x00, \T5, \XMM2, \T4 1021 vpxor \T4, \T7, \T7 1022 1023 vmovdqa HashKey_7_k(arg1), \T3 1024 vpclmulqdq $0x00, \T3, \T2, \T2 1025 vpxor \T2, \XMM1, \XMM1 1026 1027 ###################### 1028 1029 vpshufd $0b01001110, \XMM3, \T2 1030 vpxor \XMM3, \T2, \T2 1031 vmovdqa HashKey_6(arg1), \T5 1032 vpclmulqdq $0x11, \T5, \XMM3, \T4 1033 vpxor \T4, \T6, \T6 1034 1035 vpclmulqdq $0x00, \T5, \XMM3, \T4 1036 vpxor \T4, \T7, \T7 1037 1038 vmovdqa HashKey_6_k(arg1), \T3 1039 vpclmulqdq $0x00, \T3, \T2, \T2 1040 vpxor \T2, \XMM1, \XMM1 1041 1042 ###################### 1043 1044 vpshufd $0b01001110, \XMM4, \T2 1045 vpxor \XMM4, \T2, \T2 1046 vmovdqa HashKey_5(arg1), \T5 1047 vpclmulqdq $0x11, \T5, \XMM4, \T4 1048 vpxor \T4, \T6, \T6 1049 1050 vpclmulqdq $0x00, \T5, \XMM4, \T4 1051 vpxor \T4, \T7, \T7 1052 1053 vmovdqa HashKey_5_k(arg1), \T3 1054 vpclmulqdq $0x00, \T3, \T2, \T2 1055 vpxor \T2, \XMM1, \XMM1 1056 1057 ###################### 1058 1059 vpshufd $0b01001110, \XMM5, \T2 1060 vpxor \XMM5, \T2, \T2 1061 vmovdqa HashKey_4(arg1), \T5 1062 vpclmulqdq $0x11, \T5, \XMM5, \T4 1063 vpxor \T4, \T6, \T6 1064 1065 vpclmulqdq $0x00, \T5, \XMM5, \T4 1066 vpxor \T4, \T7, \T7 1067 1068 vmovdqa HashKey_4_k(arg1), \T3 1069 vpclmulqdq $0x00, \T3, \T2, \T2 1070 vpxor \T2, \XMM1, \XMM1 1071 1072 ###################### 1073 1074 vpshufd $0b01001110, \XMM6, \T2 1075 vpxor \XMM6, \T2, \T2 1076 vmovdqa HashKey_3(arg1), \T5 1077 vpclmulqdq $0x11, \T5, \XMM6, \T4 1078 vpxor \T4, \T6, \T6 1079 1080 vpclmulqdq $0x00, \T5, \XMM6, \T4 1081 vpxor \T4, \T7, \T7 1082 1083 vmovdqa HashKey_3_k(arg1), \T3 1084 vpclmulqdq $0x00, \T3, \T2, \T2 1085 vpxor \T2, \XMM1, \XMM1 1086 1087 ###################### 1088 1089 vpshufd $0b01001110, \XMM7, \T2 1090 vpxor \XMM7, \T2, \T2 1091 vmovdqa HashKey_2(arg1), \T5 1092 vpclmulqdq $0x11, \T5, \XMM7, \T4 1093 vpxor \T4, \T6, \T6 1094 1095 vpclmulqdq $0x00, \T5, \XMM7, \T4 1096 vpxor \T4, \T7, \T7 1097 1098 vmovdqa HashKey_2_k(arg1), \T3 1099 vpclmulqdq $0x00, \T3, \T2, \T2 1100 vpxor \T2, \XMM1, \XMM1 1101 1102 ###################### 1103 1104 vpshufd $0b01001110, \XMM8, \T2 1105 vpxor \XMM8, \T2, \T2 1106 vmovdqa HashKey(arg1), \T5 1107 vpclmulqdq $0x11, \T5, \XMM8, \T4 1108 vpxor \T4, \T6, \T6 1109 1110 vpclmulqdq $0x00, \T5, \XMM8, \T4 1111 vpxor \T4, \T7, \T7 1112 1113 vmovdqa HashKey_k(arg1), \T3 1114 vpclmulqdq $0x00, \T3, \T2, \T2 1115 1116 vpxor \T2, \XMM1, \XMM1 1117 vpxor \T6, \XMM1, \XMM1 1118 vpxor \T7, \XMM1, \T2 1119 1120 1121 1122 1123 vpslldq $8, \T2, \T4 1124 vpsrldq $8, \T2, \T2 1125 1126 vpxor \T4, \T7, \T7 1127 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of 1128 # the accumulated carry-less multiplications 1129 1130 ####################################################################### 1131 #first phase of the reduction 1132 vpslld $31, \T7, \T2 # packed right shifting << 31 1133 vpslld $30, \T7, \T3 # packed right shifting shift << 30 1134 vpslld $25, \T7, \T4 # packed right shifting shift << 25 1135 1136 vpxor \T3, \T2, \T2 # xor the shifted versions 1137 vpxor \T4, \T2, \T2 1138 1139 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 1140 1141 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 1142 vpxor \T2, \T7, \T7 # first phase of the reduction complete 1143 ####################################################################### 1144 1145 1146 #second phase of the reduction 1147 vpsrld $1, \T7, \T2 # packed left shifting >> 1 1148 vpsrld $2, \T7, \T3 # packed left shifting >> 2 1149 vpsrld $7, \T7, \T4 # packed left shifting >> 7 1150 vpxor \T3, \T2, \T2 # xor the shifted versions 1151 vpxor \T4, \T2, \T2 1152 1153 vpxor \T1, \T2, \T2 1154 vpxor \T2, \T7, \T7 1155 vpxor \T7, \T6, \T6 # the result is in T6 1156 1157.endm 1158 1159 1160# combined for GCM encrypt and decrypt functions 1161# clobbering all xmm registers 1162# clobbering r10, r11, r12, r13, r14, r15 1163.macro GCM_ENC_DEC_AVX ENC_DEC 1164 1165 #the number of pushes must equal STACK_OFFSET 1166 push %r12 1167 push %r13 1168 push %r14 1169 push %r15 1170 1171 mov %rsp, %r14 1172 1173 1174 1175 1176 sub $VARIABLE_OFFSET, %rsp 1177 and $~63, %rsp # align rsp to 64 bytes 1178 1179 1180 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey 1181 1182 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext 1183 and $-16, %r13 # r13 = r13 - (r13 mod 16) 1184 1185 mov %r13, %r12 1186 shr $4, %r12 1187 and $7, %r12 1188 jz _initial_num_blocks_is_0\@ 1189 1190 cmp $7, %r12 1191 je _initial_num_blocks_is_7\@ 1192 cmp $6, %r12 1193 je _initial_num_blocks_is_6\@ 1194 cmp $5, %r12 1195 je _initial_num_blocks_is_5\@ 1196 cmp $4, %r12 1197 je _initial_num_blocks_is_4\@ 1198 cmp $3, %r12 1199 je _initial_num_blocks_is_3\@ 1200 cmp $2, %r12 1201 je _initial_num_blocks_is_2\@ 1202 1203 jmp _initial_num_blocks_is_1\@ 1204 1205_initial_num_blocks_is_7\@: 1206 INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1207 sub $16*7, %r13 1208 jmp _initial_blocks_encrypted\@ 1209 1210_initial_num_blocks_is_6\@: 1211 INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1212 sub $16*6, %r13 1213 jmp _initial_blocks_encrypted\@ 1214 1215_initial_num_blocks_is_5\@: 1216 INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1217 sub $16*5, %r13 1218 jmp _initial_blocks_encrypted\@ 1219 1220_initial_num_blocks_is_4\@: 1221 INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1222 sub $16*4, %r13 1223 jmp _initial_blocks_encrypted\@ 1224 1225_initial_num_blocks_is_3\@: 1226 INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1227 sub $16*3, %r13 1228 jmp _initial_blocks_encrypted\@ 1229 1230_initial_num_blocks_is_2\@: 1231 INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1232 sub $16*2, %r13 1233 jmp _initial_blocks_encrypted\@ 1234 1235_initial_num_blocks_is_1\@: 1236 INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1237 sub $16*1, %r13 1238 jmp _initial_blocks_encrypted\@ 1239 1240_initial_num_blocks_is_0\@: 1241 INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1242 1243 1244_initial_blocks_encrypted\@: 1245 cmp $0, %r13 1246 je _zero_cipher_left\@ 1247 1248 sub $128, %r13 1249 je _eight_cipher_left\@ 1250 1251 1252 1253 1254 vmovd %xmm9, %r15d 1255 and $255, %r15d 1256 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1257 1258 1259_encrypt_by_8_new\@: 1260 cmp $(255-8), %r15d 1261 jg _encrypt_by_8\@ 1262 1263 1264 1265 add $8, %r15b 1266 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC 1267 add $128, %r11 1268 sub $128, %r13 1269 jne _encrypt_by_8_new\@ 1270 1271 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1272 jmp _eight_cipher_left\@ 1273 1274_encrypt_by_8\@: 1275 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1276 add $8, %r15b 1277 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC 1278 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1279 add $128, %r11 1280 sub $128, %r13 1281 jne _encrypt_by_8_new\@ 1282 1283 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1284 1285 1286 1287 1288_eight_cipher_left\@: 1289 GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 1290 1291 1292_zero_cipher_left\@: 1293 cmp $16, arg4 1294 jl _only_less_than_16\@ 1295 1296 mov arg4, %r13 1297 and $15, %r13 # r13 = (arg4 mod 16) 1298 1299 je _multiple_of_16_bytes\@ 1300 1301 # handle the last <16 Byte block seperately 1302 1303 1304 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn 1305 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1306 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) 1307 1308 sub $16, %r11 1309 add %r13, %r11 1310 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block 1311 1312 lea SHIFT_MASK+16(%rip), %r12 1313 sub %r13, %r12 # adjust the shuffle mask pointer to be 1314 # able to shift 16-r13 bytes (r13 is the 1315 # number of bytes in plaintext mod 16) 1316 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask 1317 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes 1318 jmp _final_ghash_mul\@ 1319 1320_only_less_than_16\@: 1321 # check for 0 length 1322 mov arg4, %r13 1323 and $15, %r13 # r13 = (arg4 mod 16) 1324 1325 je _multiple_of_16_bytes\@ 1326 1327 # handle the last <16 Byte block seperately 1328 1329 1330 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn 1331 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1332 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) 1333 1334 1335 lea SHIFT_MASK+16(%rip), %r12 1336 sub %r13, %r12 # adjust the shuffle mask pointer to be 1337 # able to shift 16-r13 bytes (r13 is the 1338 # number of bytes in plaintext mod 16) 1339 1340_get_last_16_byte_loop\@: 1341 movb (arg3, %r11), %al 1342 movb %al, TMP1 (%rsp , %r11) 1343 add $1, %r11 1344 cmp %r13, %r11 1345 jne _get_last_16_byte_loop\@ 1346 1347 vmovdqu TMP1(%rsp), %xmm1 1348 1349 sub $16, %r11 1350 1351_final_ghash_mul\@: 1352 .if \ENC_DEC == DEC 1353 vmovdqa %xmm1, %xmm2 1354 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 1355 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 1356 # mask out top 16-r13 bytes of xmm9 1357 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 1358 vpand %xmm1, %xmm2, %xmm2 1359 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 1360 vpxor %xmm2, %xmm14, %xmm14 1361 #GHASH computation for the last <16 Byte block 1362 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 1363 sub %r13, %r11 1364 add $16, %r11 1365 .else 1366 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 1367 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 1368 # mask out top 16-r13 bytes of xmm9 1369 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 1370 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1371 vpxor %xmm9, %xmm14, %xmm14 1372 #GHASH computation for the last <16 Byte block 1373 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 1374 sub %r13, %r11 1375 add $16, %r11 1376 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext 1377 .endif 1378 1379 1380 ############################# 1381 # output r13 Bytes 1382 vmovq %xmm9, %rax 1383 cmp $8, %r13 1384 jle _less_than_8_bytes_left\@ 1385 1386 mov %rax, (arg2 , %r11) 1387 add $8, %r11 1388 vpsrldq $8, %xmm9, %xmm9 1389 vmovq %xmm9, %rax 1390 sub $8, %r13 1391 1392_less_than_8_bytes_left\@: 1393 movb %al, (arg2 , %r11) 1394 add $1, %r11 1395 shr $8, %rax 1396 sub $1, %r13 1397 jne _less_than_8_bytes_left\@ 1398 ############################# 1399 1400_multiple_of_16_bytes\@: 1401 mov arg7, %r12 # r12 = aadLen (number of bytes) 1402 shl $3, %r12 # convert into number of bits 1403 vmovd %r12d, %xmm15 # len(A) in xmm15 1404 1405 shl $3, arg4 # len(C) in bits (*128) 1406 vmovq arg4, %xmm1 1407 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 1408 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) 1409 1410 vpxor %xmm15, %xmm14, %xmm14 1411 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation 1412 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap 1413 1414 mov arg5, %rax # rax = *Y0 1415 vmovdqu (%rax), %xmm9 # xmm9 = Y0 1416 1417 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) 1418 1419 vpxor %xmm14, %xmm9, %xmm9 1420 1421 1422 1423_return_T\@: 1424 mov arg8, %r10 # r10 = authTag 1425 mov arg9, %r11 # r11 = auth_tag_len 1426 1427 cmp $16, %r11 1428 je _T_16\@ 1429 1430 cmp $12, %r11 1431 je _T_12\@ 1432 1433_T_8\@: 1434 vmovq %xmm9, %rax 1435 mov %rax, (%r10) 1436 jmp _return_T_done\@ 1437_T_12\@: 1438 vmovq %xmm9, %rax 1439 mov %rax, (%r10) 1440 vpsrldq $8, %xmm9, %xmm9 1441 vmovd %xmm9, %eax 1442 mov %eax, 8(%r10) 1443 jmp _return_T_done\@ 1444 1445_T_16\@: 1446 vmovdqu %xmm9, (%r10) 1447 1448_return_T_done\@: 1449 mov %r14, %rsp 1450 1451 pop %r15 1452 pop %r14 1453 pop %r13 1454 pop %r12 1455.endm 1456 1457 1458############################################################# 1459#void aesni_gcm_precomp_avx_gen2 1460# (gcm_data *my_ctx_data, 1461# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ 1462############################################################# 1463ENTRY(aesni_gcm_precomp_avx_gen2) 1464 #the number of pushes must equal STACK_OFFSET 1465 push %r12 1466 push %r13 1467 push %r14 1468 push %r15 1469 1470 mov %rsp, %r14 1471 1472 1473 1474 sub $VARIABLE_OFFSET, %rsp 1475 and $~63, %rsp # align rsp to 64 bytes 1476 1477 vmovdqu (arg2), %xmm6 # xmm6 = HashKey 1478 1479 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 1480 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey 1481 vmovdqa %xmm6, %xmm2 1482 vpsllq $1, %xmm6, %xmm6 1483 vpsrlq $63, %xmm2, %xmm2 1484 vmovdqa %xmm2, %xmm1 1485 vpslldq $8, %xmm2, %xmm2 1486 vpsrldq $8, %xmm1, %xmm1 1487 vpor %xmm2, %xmm6, %xmm6 1488 #reduction 1489 vpshufd $0b00100100, %xmm1, %xmm2 1490 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 1491 vpand POLY(%rip), %xmm2, %xmm2 1492 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly 1493 ####################################################################### 1494 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly 1495 1496 1497 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 1498 1499 mov %r14, %rsp 1500 1501 pop %r15 1502 pop %r14 1503 pop %r13 1504 pop %r12 1505 ret 1506ENDPROC(aesni_gcm_precomp_avx_gen2) 1507 1508############################################################################### 1509#void aesni_gcm_enc_avx_gen2( 1510# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1511# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ 1512# const u8 *in, /* Plaintext input */ 1513# u64 plaintext_len, /* Length of data in Bytes for encryption. */ 1514# u8 *iv, /* Pre-counter block j0: 4 byte salt 1515# (from Security Association) concatenated with 8 byte 1516# Initialisation Vector (from IPSec ESP Payload) 1517# concatenated with 0x00000001. 16-byte aligned pointer. */ 1518# const u8 *aad, /* Additional Authentication Data (AAD)*/ 1519# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 1520# u8 *auth_tag, /* Authenticated Tag output. */ 1521# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 1522# Valid values are 16 (most likely), 12 or 8. */ 1523############################################################################### 1524ENTRY(aesni_gcm_enc_avx_gen2) 1525 GCM_ENC_DEC_AVX ENC 1526 ret 1527ENDPROC(aesni_gcm_enc_avx_gen2) 1528 1529############################################################################### 1530#void aesni_gcm_dec_avx_gen2( 1531# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1532# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ 1533# const u8 *in, /* Ciphertext input */ 1534# u64 plaintext_len, /* Length of data in Bytes for encryption. */ 1535# u8 *iv, /* Pre-counter block j0: 4 byte salt 1536# (from Security Association) concatenated with 8 byte 1537# Initialisation Vector (from IPSec ESP Payload) 1538# concatenated with 0x00000001. 16-byte aligned pointer. */ 1539# const u8 *aad, /* Additional Authentication Data (AAD)*/ 1540# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 1541# u8 *auth_tag, /* Authenticated Tag output. */ 1542# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 1543# Valid values are 16 (most likely), 12 or 8. */ 1544############################################################################### 1545ENTRY(aesni_gcm_dec_avx_gen2) 1546 GCM_ENC_DEC_AVX DEC 1547 ret 1548ENDPROC(aesni_gcm_dec_avx_gen2) 1549#endif /* CONFIG_AS_AVX */ 1550 1551#ifdef CONFIG_AS_AVX2 1552############################################################################### 1553# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 1554# Input: A and B (128-bits each, bit-reflected) 1555# Output: C = A*B*x mod poly, (i.e. >>1 ) 1556# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 1557# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 1558############################################################################### 1559.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 1560 1561 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 1562 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 1563 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 1564 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 1565 vpxor \T3, \GH, \GH 1566 1567 1568 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs 1569 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs 1570 1571 vpxor \T3, \T1, \T1 1572 vpxor \T2, \GH, \GH 1573 1574 ####################################################################### 1575 #first phase of the reduction 1576 vmovdqa POLY2(%rip), \T3 1577 1578 vpclmulqdq $0x01, \GH, \T3, \T2 1579 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs 1580 1581 vpxor \T2, \GH, \GH # first phase of the reduction complete 1582 ####################################################################### 1583 #second phase of the reduction 1584 vpclmulqdq $0x00, \GH, \T3, \T2 1585 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 1586 1587 vpclmulqdq $0x10, \GH, \T3, \GH 1588 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) 1589 1590 vpxor \T2, \GH, \GH # second phase of the reduction complete 1591 ####################################################################### 1592 vpxor \T1, \GH, \GH # the result is in GH 1593 1594 1595.endm 1596 1597.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 1598 1599 # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 1600 vmovdqa \HK, \T5 1601 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly 1602 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly 1603 1604 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly 1605 vmovdqa \T5, HashKey_3(arg1) 1606 1607 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly 1608 vmovdqa \T5, HashKey_4(arg1) 1609 1610 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly 1611 vmovdqa \T5, HashKey_5(arg1) 1612 1613 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly 1614 vmovdqa \T5, HashKey_6(arg1) 1615 1616 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly 1617 vmovdqa \T5, HashKey_7(arg1) 1618 1619 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly 1620 vmovdqa \T5, HashKey_8(arg1) 1621 1622.endm 1623 1624 1625## if a = number of total plaintext bytes 1626## b = floor(a/16) 1627## num_initial_blocks = b mod 4# 1628## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext 1629## r10, r11, r12, rax are clobbered 1630## arg1, arg2, arg3, r14 are used as a pointer only, not modified 1631 1632.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER 1633 i = (8-\num_initial_blocks) 1634 setreg 1635 1636 mov arg6, %r10 # r10 = AAD 1637 mov arg7, %r12 # r12 = aadLen 1638 1639 1640 mov %r12, %r11 1641 1642 vpxor reg_i, reg_i, reg_i 1643_get_AAD_loop\@: 1644 vmovd (%r10), \T1 1645 vpslldq $12, \T1, \T1 1646 vpsrldq $4, reg_i, reg_i 1647 vpxor \T1, reg_i, reg_i 1648 1649 add $4, %r10 1650 sub $4, %r12 1651 jg _get_AAD_loop\@ 1652 1653 1654 cmp $16, %r11 1655 je _get_AAD_loop2_done\@ 1656 mov $16, %r12 1657 1658_get_AAD_loop2\@: 1659 vpsrldq $4, reg_i, reg_i 1660 sub $4, %r12 1661 cmp %r11, %r12 1662 jg _get_AAD_loop2\@ 1663 1664_get_AAD_loop2_done\@: 1665 1666 #byte-reflect the AAD data 1667 vpshufb SHUF_MASK(%rip), reg_i, reg_i 1668 1669 # initialize the data pointer offset as zero 1670 xor %r11, %r11 1671 1672 # start AES for num_initial_blocks blocks 1673 mov arg5, %rax # rax = *Y0 1674 vmovdqu (%rax), \CTR # CTR = Y0 1675 vpshufb SHUF_MASK(%rip), \CTR, \CTR 1676 1677 1678 i = (9-\num_initial_blocks) 1679 setreg 1680.rep \num_initial_blocks 1681 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1682 vmovdqa \CTR, reg_i 1683 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap 1684 i = (i+1) 1685 setreg 1686.endr 1687 1688 vmovdqa (arg1), \T_key 1689 i = (9-\num_initial_blocks) 1690 setreg 1691.rep \num_initial_blocks 1692 vpxor \T_key, reg_i, reg_i 1693 i = (i+1) 1694 setreg 1695.endr 1696 1697 j = 1 1698 setreg 1699.rep 9 1700 vmovdqa 16*j(arg1), \T_key 1701 i = (9-\num_initial_blocks) 1702 setreg 1703.rep \num_initial_blocks 1704 vaesenc \T_key, reg_i, reg_i 1705 i = (i+1) 1706 setreg 1707.endr 1708 1709 j = (j+1) 1710 setreg 1711.endr 1712 1713 1714 vmovdqa 16*10(arg1), \T_key 1715 i = (9-\num_initial_blocks) 1716 setreg 1717.rep \num_initial_blocks 1718 vaesenclast \T_key, reg_i, reg_i 1719 i = (i+1) 1720 setreg 1721.endr 1722 1723 i = (9-\num_initial_blocks) 1724 setreg 1725.rep \num_initial_blocks 1726 vmovdqu (arg3, %r11), \T1 1727 vpxor \T1, reg_i, reg_i 1728 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for 1729 # num_initial_blocks blocks 1730 add $16, %r11 1731.if \ENC_DEC == DEC 1732 vmovdqa \T1, reg_i 1733.endif 1734 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 1735 i = (i+1) 1736 setreg 1737.endr 1738 1739 1740 i = (8-\num_initial_blocks) 1741 j = (9-\num_initial_blocks) 1742 setreg 1743 GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 1744 1745.rep \num_initial_blocks 1746 vpxor reg_i, reg_j, reg_j 1747 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks 1748 i = (i+1) 1749 j = (j+1) 1750 setreg 1751.endr 1752 # XMM8 has the combined result here 1753 1754 vmovdqa \XMM8, TMP1(%rsp) 1755 vmovdqa \XMM8, \T3 1756 1757 cmp $128, %r13 1758 jl _initial_blocks_done\@ # no need for precomputed constants 1759 1760############################################################################### 1761# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 1762 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1763 vmovdqa \CTR, \XMM1 1764 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1765 1766 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1767 vmovdqa \CTR, \XMM2 1768 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1769 1770 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1771 vmovdqa \CTR, \XMM3 1772 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1773 1774 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1775 vmovdqa \CTR, \XMM4 1776 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1777 1778 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1779 vmovdqa \CTR, \XMM5 1780 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1781 1782 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1783 vmovdqa \CTR, \XMM6 1784 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1785 1786 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1787 vmovdqa \CTR, \XMM7 1788 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1789 1790 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1791 vmovdqa \CTR, \XMM8 1792 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1793 1794 vmovdqa (arg1), \T_key 1795 vpxor \T_key, \XMM1, \XMM1 1796 vpxor \T_key, \XMM2, \XMM2 1797 vpxor \T_key, \XMM3, \XMM3 1798 vpxor \T_key, \XMM4, \XMM4 1799 vpxor \T_key, \XMM5, \XMM5 1800 vpxor \T_key, \XMM6, \XMM6 1801 vpxor \T_key, \XMM7, \XMM7 1802 vpxor \T_key, \XMM8, \XMM8 1803 1804 i = 1 1805 setreg 1806.rep 9 # do 9 rounds 1807 vmovdqa 16*i(arg1), \T_key 1808 vaesenc \T_key, \XMM1, \XMM1 1809 vaesenc \T_key, \XMM2, \XMM2 1810 vaesenc \T_key, \XMM3, \XMM3 1811 vaesenc \T_key, \XMM4, \XMM4 1812 vaesenc \T_key, \XMM5, \XMM5 1813 vaesenc \T_key, \XMM6, \XMM6 1814 vaesenc \T_key, \XMM7, \XMM7 1815 vaesenc \T_key, \XMM8, \XMM8 1816 i = (i+1) 1817 setreg 1818.endr 1819 1820 1821 vmovdqa 16*i(arg1), \T_key 1822 vaesenclast \T_key, \XMM1, \XMM1 1823 vaesenclast \T_key, \XMM2, \XMM2 1824 vaesenclast \T_key, \XMM3, \XMM3 1825 vaesenclast \T_key, \XMM4, \XMM4 1826 vaesenclast \T_key, \XMM5, \XMM5 1827 vaesenclast \T_key, \XMM6, \XMM6 1828 vaesenclast \T_key, \XMM7, \XMM7 1829 vaesenclast \T_key, \XMM8, \XMM8 1830 1831 vmovdqu (arg3, %r11), \T1 1832 vpxor \T1, \XMM1, \XMM1 1833 vmovdqu \XMM1, (arg2 , %r11) 1834 .if \ENC_DEC == DEC 1835 vmovdqa \T1, \XMM1 1836 .endif 1837 1838 vmovdqu 16*1(arg3, %r11), \T1 1839 vpxor \T1, \XMM2, \XMM2 1840 vmovdqu \XMM2, 16*1(arg2 , %r11) 1841 .if \ENC_DEC == DEC 1842 vmovdqa \T1, \XMM2 1843 .endif 1844 1845 vmovdqu 16*2(arg3, %r11), \T1 1846 vpxor \T1, \XMM3, \XMM3 1847 vmovdqu \XMM3, 16*2(arg2 , %r11) 1848 .if \ENC_DEC == DEC 1849 vmovdqa \T1, \XMM3 1850 .endif 1851 1852 vmovdqu 16*3(arg3, %r11), \T1 1853 vpxor \T1, \XMM4, \XMM4 1854 vmovdqu \XMM4, 16*3(arg2 , %r11) 1855 .if \ENC_DEC == DEC 1856 vmovdqa \T1, \XMM4 1857 .endif 1858 1859 vmovdqu 16*4(arg3, %r11), \T1 1860 vpxor \T1, \XMM5, \XMM5 1861 vmovdqu \XMM5, 16*4(arg2 , %r11) 1862 .if \ENC_DEC == DEC 1863 vmovdqa \T1, \XMM5 1864 .endif 1865 1866 vmovdqu 16*5(arg3, %r11), \T1 1867 vpxor \T1, \XMM6, \XMM6 1868 vmovdqu \XMM6, 16*5(arg2 , %r11) 1869 .if \ENC_DEC == DEC 1870 vmovdqa \T1, \XMM6 1871 .endif 1872 1873 vmovdqu 16*6(arg3, %r11), \T1 1874 vpxor \T1, \XMM7, \XMM7 1875 vmovdqu \XMM7, 16*6(arg2 , %r11) 1876 .if \ENC_DEC == DEC 1877 vmovdqa \T1, \XMM7 1878 .endif 1879 1880 vmovdqu 16*7(arg3, %r11), \T1 1881 vpxor \T1, \XMM8, \XMM8 1882 vmovdqu \XMM8, 16*7(arg2 , %r11) 1883 .if \ENC_DEC == DEC 1884 vmovdqa \T1, \XMM8 1885 .endif 1886 1887 add $128, %r11 1888 1889 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1890 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with 1891 # the corresponding ciphertext 1892 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1893 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1894 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1895 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1896 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1897 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1898 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1899 1900############################################################################### 1901 1902_initial_blocks_done\@: 1903 1904 1905.endm 1906 1907 1908 1909# encrypt 8 blocks at a time 1910# ghash the 8 previously encrypted ciphertext blocks 1911# arg1, arg2, arg3 are used as pointers only, not modified 1912# r11 is the data offset value 1913.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC 1914 1915 vmovdqa \XMM1, \T2 1916 vmovdqa \XMM2, TMP2(%rsp) 1917 vmovdqa \XMM3, TMP3(%rsp) 1918 vmovdqa \XMM4, TMP4(%rsp) 1919 vmovdqa \XMM5, TMP5(%rsp) 1920 vmovdqa \XMM6, TMP6(%rsp) 1921 vmovdqa \XMM7, TMP7(%rsp) 1922 vmovdqa \XMM8, TMP8(%rsp) 1923 1924.if \loop_idx == in_order 1925 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT 1926 vpaddd ONE(%rip), \XMM1, \XMM2 1927 vpaddd ONE(%rip), \XMM2, \XMM3 1928 vpaddd ONE(%rip), \XMM3, \XMM4 1929 vpaddd ONE(%rip), \XMM4, \XMM5 1930 vpaddd ONE(%rip), \XMM5, \XMM6 1931 vpaddd ONE(%rip), \XMM6, \XMM7 1932 vpaddd ONE(%rip), \XMM7, \XMM8 1933 vmovdqa \XMM8, \CTR 1934 1935 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1936 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1937 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1938 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1939 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1940 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1941 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1942 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1943.else 1944 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT 1945 vpaddd ONEf(%rip), \XMM1, \XMM2 1946 vpaddd ONEf(%rip), \XMM2, \XMM3 1947 vpaddd ONEf(%rip), \XMM3, \XMM4 1948 vpaddd ONEf(%rip), \XMM4, \XMM5 1949 vpaddd ONEf(%rip), \XMM5, \XMM6 1950 vpaddd ONEf(%rip), \XMM6, \XMM7 1951 vpaddd ONEf(%rip), \XMM7, \XMM8 1952 vmovdqa \XMM8, \CTR 1953.endif 1954 1955 1956 ####################################################################### 1957 1958 vmovdqu (arg1), \T1 1959 vpxor \T1, \XMM1, \XMM1 1960 vpxor \T1, \XMM2, \XMM2 1961 vpxor \T1, \XMM3, \XMM3 1962 vpxor \T1, \XMM4, \XMM4 1963 vpxor \T1, \XMM5, \XMM5 1964 vpxor \T1, \XMM6, \XMM6 1965 vpxor \T1, \XMM7, \XMM7 1966 vpxor \T1, \XMM8, \XMM8 1967 1968 ####################################################################### 1969 1970 1971 1972 1973 1974 vmovdqu 16*1(arg1), \T1 1975 vaesenc \T1, \XMM1, \XMM1 1976 vaesenc \T1, \XMM2, \XMM2 1977 vaesenc \T1, \XMM3, \XMM3 1978 vaesenc \T1, \XMM4, \XMM4 1979 vaesenc \T1, \XMM5, \XMM5 1980 vaesenc \T1, \XMM6, \XMM6 1981 vaesenc \T1, \XMM7, \XMM7 1982 vaesenc \T1, \XMM8, \XMM8 1983 1984 vmovdqu 16*2(arg1), \T1 1985 vaesenc \T1, \XMM1, \XMM1 1986 vaesenc \T1, \XMM2, \XMM2 1987 vaesenc \T1, \XMM3, \XMM3 1988 vaesenc \T1, \XMM4, \XMM4 1989 vaesenc \T1, \XMM5, \XMM5 1990 vaesenc \T1, \XMM6, \XMM6 1991 vaesenc \T1, \XMM7, \XMM7 1992 vaesenc \T1, \XMM8, \XMM8 1993 1994 1995 ####################################################################### 1996 1997 vmovdqa HashKey_8(arg1), \T5 1998 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 1999 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 2000 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 2001 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 2002 vpxor \T5, \T6, \T6 2003 2004 vmovdqu 16*3(arg1), \T1 2005 vaesenc \T1, \XMM1, \XMM1 2006 vaesenc \T1, \XMM2, \XMM2 2007 vaesenc \T1, \XMM3, \XMM3 2008 vaesenc \T1, \XMM4, \XMM4 2009 vaesenc \T1, \XMM5, \XMM5 2010 vaesenc \T1, \XMM6, \XMM6 2011 vaesenc \T1, \XMM7, \XMM7 2012 vaesenc \T1, \XMM8, \XMM8 2013 2014 vmovdqa TMP2(%rsp), \T1 2015 vmovdqa HashKey_7(arg1), \T5 2016 vpclmulqdq $0x11, \T5, \T1, \T3 2017 vpxor \T3, \T4, \T4 2018 2019 vpclmulqdq $0x00, \T5, \T1, \T3 2020 vpxor \T3, \T7, \T7 2021 2022 vpclmulqdq $0x01, \T5, \T1, \T3 2023 vpxor \T3, \T6, \T6 2024 2025 vpclmulqdq $0x10, \T5, \T1, \T3 2026 vpxor \T3, \T6, \T6 2027 2028 vmovdqu 16*4(arg1), \T1 2029 vaesenc \T1, \XMM1, \XMM1 2030 vaesenc \T1, \XMM2, \XMM2 2031 vaesenc \T1, \XMM3, \XMM3 2032 vaesenc \T1, \XMM4, \XMM4 2033 vaesenc \T1, \XMM5, \XMM5 2034 vaesenc \T1, \XMM6, \XMM6 2035 vaesenc \T1, \XMM7, \XMM7 2036 vaesenc \T1, \XMM8, \XMM8 2037 2038 ####################################################################### 2039 2040 vmovdqa TMP3(%rsp), \T1 2041 vmovdqa HashKey_6(arg1), \T5 2042 vpclmulqdq $0x11, \T5, \T1, \T3 2043 vpxor \T3, \T4, \T4 2044 2045 vpclmulqdq $0x00, \T5, \T1, \T3 2046 vpxor \T3, \T7, \T7 2047 2048 vpclmulqdq $0x01, \T5, \T1, \T3 2049 vpxor \T3, \T6, \T6 2050 2051 vpclmulqdq $0x10, \T5, \T1, \T3 2052 vpxor \T3, \T6, \T6 2053 2054 vmovdqu 16*5(arg1), \T1 2055 vaesenc \T1, \XMM1, \XMM1 2056 vaesenc \T1, \XMM2, \XMM2 2057 vaesenc \T1, \XMM3, \XMM3 2058 vaesenc \T1, \XMM4, \XMM4 2059 vaesenc \T1, \XMM5, \XMM5 2060 vaesenc \T1, \XMM6, \XMM6 2061 vaesenc \T1, \XMM7, \XMM7 2062 vaesenc \T1, \XMM8, \XMM8 2063 2064 vmovdqa TMP4(%rsp), \T1 2065 vmovdqa HashKey_5(arg1), \T5 2066 vpclmulqdq $0x11, \T5, \T1, \T3 2067 vpxor \T3, \T4, \T4 2068 2069 vpclmulqdq $0x00, \T5, \T1, \T3 2070 vpxor \T3, \T7, \T7 2071 2072 vpclmulqdq $0x01, \T5, \T1, \T3 2073 vpxor \T3, \T6, \T6 2074 2075 vpclmulqdq $0x10, \T5, \T1, \T3 2076 vpxor \T3, \T6, \T6 2077 2078 vmovdqu 16*6(arg1), \T1 2079 vaesenc \T1, \XMM1, \XMM1 2080 vaesenc \T1, \XMM2, \XMM2 2081 vaesenc \T1, \XMM3, \XMM3 2082 vaesenc \T1, \XMM4, \XMM4 2083 vaesenc \T1, \XMM5, \XMM5 2084 vaesenc \T1, \XMM6, \XMM6 2085 vaesenc \T1, \XMM7, \XMM7 2086 vaesenc \T1, \XMM8, \XMM8 2087 2088 2089 vmovdqa TMP5(%rsp), \T1 2090 vmovdqa HashKey_4(arg1), \T5 2091 vpclmulqdq $0x11, \T5, \T1, \T3 2092 vpxor \T3, \T4, \T4 2093 2094 vpclmulqdq $0x00, \T5, \T1, \T3 2095 vpxor \T3, \T7, \T7 2096 2097 vpclmulqdq $0x01, \T5, \T1, \T3 2098 vpxor \T3, \T6, \T6 2099 2100 vpclmulqdq $0x10, \T5, \T1, \T3 2101 vpxor \T3, \T6, \T6 2102 2103 vmovdqu 16*7(arg1), \T1 2104 vaesenc \T1, \XMM1, \XMM1 2105 vaesenc \T1, \XMM2, \XMM2 2106 vaesenc \T1, \XMM3, \XMM3 2107 vaesenc \T1, \XMM4, \XMM4 2108 vaesenc \T1, \XMM5, \XMM5 2109 vaesenc \T1, \XMM6, \XMM6 2110 vaesenc \T1, \XMM7, \XMM7 2111 vaesenc \T1, \XMM8, \XMM8 2112 2113 vmovdqa TMP6(%rsp), \T1 2114 vmovdqa HashKey_3(arg1), \T5 2115 vpclmulqdq $0x11, \T5, \T1, \T3 2116 vpxor \T3, \T4, \T4 2117 2118 vpclmulqdq $0x00, \T5, \T1, \T3 2119 vpxor \T3, \T7, \T7 2120 2121 vpclmulqdq $0x01, \T5, \T1, \T3 2122 vpxor \T3, \T6, \T6 2123 2124 vpclmulqdq $0x10, \T5, \T1, \T3 2125 vpxor \T3, \T6, \T6 2126 2127 vmovdqu 16*8(arg1), \T1 2128 vaesenc \T1, \XMM1, \XMM1 2129 vaesenc \T1, \XMM2, \XMM2 2130 vaesenc \T1, \XMM3, \XMM3 2131 vaesenc \T1, \XMM4, \XMM4 2132 vaesenc \T1, \XMM5, \XMM5 2133 vaesenc \T1, \XMM6, \XMM6 2134 vaesenc \T1, \XMM7, \XMM7 2135 vaesenc \T1, \XMM8, \XMM8 2136 2137 vmovdqa TMP7(%rsp), \T1 2138 vmovdqa HashKey_2(arg1), \T5 2139 vpclmulqdq $0x11, \T5, \T1, \T3 2140 vpxor \T3, \T4, \T4 2141 2142 vpclmulqdq $0x00, \T5, \T1, \T3 2143 vpxor \T3, \T7, \T7 2144 2145 vpclmulqdq $0x01, \T5, \T1, \T3 2146 vpxor \T3, \T6, \T6 2147 2148 vpclmulqdq $0x10, \T5, \T1, \T3 2149 vpxor \T3, \T6, \T6 2150 2151 2152 ####################################################################### 2153 2154 vmovdqu 16*9(arg1), \T5 2155 vaesenc \T5, \XMM1, \XMM1 2156 vaesenc \T5, \XMM2, \XMM2 2157 vaesenc \T5, \XMM3, \XMM3 2158 vaesenc \T5, \XMM4, \XMM4 2159 vaesenc \T5, \XMM5, \XMM5 2160 vaesenc \T5, \XMM6, \XMM6 2161 vaesenc \T5, \XMM7, \XMM7 2162 vaesenc \T5, \XMM8, \XMM8 2163 2164 vmovdqa TMP8(%rsp), \T1 2165 vmovdqa HashKey(arg1), \T5 2166 2167 vpclmulqdq $0x00, \T5, \T1, \T3 2168 vpxor \T3, \T7, \T7 2169 2170 vpclmulqdq $0x01, \T5, \T1, \T3 2171 vpxor \T3, \T6, \T6 2172 2173 vpclmulqdq $0x10, \T5, \T1, \T3 2174 vpxor \T3, \T6, \T6 2175 2176 vpclmulqdq $0x11, \T5, \T1, \T3 2177 vpxor \T3, \T4, \T1 2178 2179 2180 vmovdqu 16*10(arg1), \T5 2181 2182 i = 0 2183 j = 1 2184 setreg 2185.rep 8 2186 vpxor 16*i(arg3, %r11), \T5, \T2 2187 .if \ENC_DEC == ENC 2188 vaesenclast \T2, reg_j, reg_j 2189 .else 2190 vaesenclast \T2, reg_j, \T3 2191 vmovdqu 16*i(arg3, %r11), reg_j 2192 vmovdqu \T3, 16*i(arg2, %r11) 2193 .endif 2194 i = (i+1) 2195 j = (j+1) 2196 setreg 2197.endr 2198 ####################################################################### 2199 2200 2201 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs 2202 vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs 2203 vpxor \T3, \T7, \T7 2204 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 2205 2206 2207 2208 ####################################################################### 2209 #first phase of the reduction 2210 vmovdqa POLY2(%rip), \T3 2211 2212 vpclmulqdq $0x01, \T7, \T3, \T2 2213 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs 2214 2215 vpxor \T2, \T7, \T7 # first phase of the reduction complete 2216 ####################################################################### 2217 .if \ENC_DEC == ENC 2218 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer 2219 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer 2220 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer 2221 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer 2222 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer 2223 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer 2224 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer 2225 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer 2226 .endif 2227 2228 ####################################################################### 2229 #second phase of the reduction 2230 vpclmulqdq $0x00, \T7, \T3, \T2 2231 vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2232 2233 vpclmulqdq $0x10, \T7, \T3, \T4 2234 vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) 2235 2236 vpxor \T2, \T4, \T4 # second phase of the reduction complete 2237 ####################################################################### 2238 vpxor \T4, \T1, \T1 # the result is in T1 2239 2240 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2241 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2242 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2243 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2244 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2245 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2246 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2247 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2248 2249 2250 vpxor \T1, \XMM1, \XMM1 2251 2252 2253 2254.endm 2255 2256 2257# GHASH the last 4 ciphertext blocks. 2258.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 2259 2260 ## Karatsuba Method 2261 2262 vmovdqa HashKey_8(arg1), \T5 2263 2264 vpshufd $0b01001110, \XMM1, \T2 2265 vpshufd $0b01001110, \T5, \T3 2266 vpxor \XMM1, \T2, \T2 2267 vpxor \T5, \T3, \T3 2268 2269 vpclmulqdq $0x11, \T5, \XMM1, \T6 2270 vpclmulqdq $0x00, \T5, \XMM1, \T7 2271 2272 vpclmulqdq $0x00, \T3, \T2, \XMM1 2273 2274 ###################### 2275 2276 vmovdqa HashKey_7(arg1), \T5 2277 vpshufd $0b01001110, \XMM2, \T2 2278 vpshufd $0b01001110, \T5, \T3 2279 vpxor \XMM2, \T2, \T2 2280 vpxor \T5, \T3, \T3 2281 2282 vpclmulqdq $0x11, \T5, \XMM2, \T4 2283 vpxor \T4, \T6, \T6 2284 2285 vpclmulqdq $0x00, \T5, \XMM2, \T4 2286 vpxor \T4, \T7, \T7 2287 2288 vpclmulqdq $0x00, \T3, \T2, \T2 2289 2290 vpxor \T2, \XMM1, \XMM1 2291 2292 ###################### 2293 2294 vmovdqa HashKey_6(arg1), \T5 2295 vpshufd $0b01001110, \XMM3, \T2 2296 vpshufd $0b01001110, \T5, \T3 2297 vpxor \XMM3, \T2, \T2 2298 vpxor \T5, \T3, \T3 2299 2300 vpclmulqdq $0x11, \T5, \XMM3, \T4 2301 vpxor \T4, \T6, \T6 2302 2303 vpclmulqdq $0x00, \T5, \XMM3, \T4 2304 vpxor \T4, \T7, \T7 2305 2306 vpclmulqdq $0x00, \T3, \T2, \T2 2307 2308 vpxor \T2, \XMM1, \XMM1 2309 2310 ###################### 2311 2312 vmovdqa HashKey_5(arg1), \T5 2313 vpshufd $0b01001110, \XMM4, \T2 2314 vpshufd $0b01001110, \T5, \T3 2315 vpxor \XMM4, \T2, \T2 2316 vpxor \T5, \T3, \T3 2317 2318 vpclmulqdq $0x11, \T5, \XMM4, \T4 2319 vpxor \T4, \T6, \T6 2320 2321 vpclmulqdq $0x00, \T5, \XMM4, \T4 2322 vpxor \T4, \T7, \T7 2323 2324 vpclmulqdq $0x00, \T3, \T2, \T2 2325 2326 vpxor \T2, \XMM1, \XMM1 2327 2328 ###################### 2329 2330 vmovdqa HashKey_4(arg1), \T5 2331 vpshufd $0b01001110, \XMM5, \T2 2332 vpshufd $0b01001110, \T5, \T3 2333 vpxor \XMM5, \T2, \T2 2334 vpxor \T5, \T3, \T3 2335 2336 vpclmulqdq $0x11, \T5, \XMM5, \T4 2337 vpxor \T4, \T6, \T6 2338 2339 vpclmulqdq $0x00, \T5, \XMM5, \T4 2340 vpxor \T4, \T7, \T7 2341 2342 vpclmulqdq $0x00, \T3, \T2, \T2 2343 2344 vpxor \T2, \XMM1, \XMM1 2345 2346 ###################### 2347 2348 vmovdqa HashKey_3(arg1), \T5 2349 vpshufd $0b01001110, \XMM6, \T2 2350 vpshufd $0b01001110, \T5, \T3 2351 vpxor \XMM6, \T2, \T2 2352 vpxor \T5, \T3, \T3 2353 2354 vpclmulqdq $0x11, \T5, \XMM6, \T4 2355 vpxor \T4, \T6, \T6 2356 2357 vpclmulqdq $0x00, \T5, \XMM6, \T4 2358 vpxor \T4, \T7, \T7 2359 2360 vpclmulqdq $0x00, \T3, \T2, \T2 2361 2362 vpxor \T2, \XMM1, \XMM1 2363 2364 ###################### 2365 2366 vmovdqa HashKey_2(arg1), \T5 2367 vpshufd $0b01001110, \XMM7, \T2 2368 vpshufd $0b01001110, \T5, \T3 2369 vpxor \XMM7, \T2, \T2 2370 vpxor \T5, \T3, \T3 2371 2372 vpclmulqdq $0x11, \T5, \XMM7, \T4 2373 vpxor \T4, \T6, \T6 2374 2375 vpclmulqdq $0x00, \T5, \XMM7, \T4 2376 vpxor \T4, \T7, \T7 2377 2378 vpclmulqdq $0x00, \T3, \T2, \T2 2379 2380 vpxor \T2, \XMM1, \XMM1 2381 2382 ###################### 2383 2384 vmovdqa HashKey(arg1), \T5 2385 vpshufd $0b01001110, \XMM8, \T2 2386 vpshufd $0b01001110, \T5, \T3 2387 vpxor \XMM8, \T2, \T2 2388 vpxor \T5, \T3, \T3 2389 2390 vpclmulqdq $0x11, \T5, \XMM8, \T4 2391 vpxor \T4, \T6, \T6 2392 2393 vpclmulqdq $0x00, \T5, \XMM8, \T4 2394 vpxor \T4, \T7, \T7 2395 2396 vpclmulqdq $0x00, \T3, \T2, \T2 2397 2398 vpxor \T2, \XMM1, \XMM1 2399 vpxor \T6, \XMM1, \XMM1 2400 vpxor \T7, \XMM1, \T2 2401 2402 2403 2404 2405 vpslldq $8, \T2, \T4 2406 vpsrldq $8, \T2, \T2 2407 2408 vpxor \T4, \T7, \T7 2409 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the 2410 # accumulated carry-less multiplications 2411 2412 ####################################################################### 2413 #first phase of the reduction 2414 vmovdqa POLY2(%rip), \T3 2415 2416 vpclmulqdq $0x01, \T7, \T3, \T2 2417 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs 2418 2419 vpxor \T2, \T7, \T7 # first phase of the reduction complete 2420 ####################################################################### 2421 2422 2423 #second phase of the reduction 2424 vpclmulqdq $0x00, \T7, \T3, \T2 2425 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2426 2427 vpclmulqdq $0x10, \T7, \T3, \T4 2428 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) 2429 2430 vpxor \T2, \T4, \T4 # second phase of the reduction complete 2431 ####################################################################### 2432 vpxor \T4, \T6, \T6 # the result is in T6 2433.endm 2434 2435 2436 2437# combined for GCM encrypt and decrypt functions 2438# clobbering all xmm registers 2439# clobbering r10, r11, r12, r13, r14, r15 2440.macro GCM_ENC_DEC_AVX2 ENC_DEC 2441 2442 #the number of pushes must equal STACK_OFFSET 2443 push %r12 2444 push %r13 2445 push %r14 2446 push %r15 2447 2448 mov %rsp, %r14 2449 2450 2451 2452 2453 sub $VARIABLE_OFFSET, %rsp 2454 and $~63, %rsp # align rsp to 64 bytes 2455 2456 2457 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey 2458 2459 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext 2460 and $-16, %r13 # r13 = r13 - (r13 mod 16) 2461 2462 mov %r13, %r12 2463 shr $4, %r12 2464 and $7, %r12 2465 jz _initial_num_blocks_is_0\@ 2466 2467 cmp $7, %r12 2468 je _initial_num_blocks_is_7\@ 2469 cmp $6, %r12 2470 je _initial_num_blocks_is_6\@ 2471 cmp $5, %r12 2472 je _initial_num_blocks_is_5\@ 2473 cmp $4, %r12 2474 je _initial_num_blocks_is_4\@ 2475 cmp $3, %r12 2476 je _initial_num_blocks_is_3\@ 2477 cmp $2, %r12 2478 je _initial_num_blocks_is_2\@ 2479 2480 jmp _initial_num_blocks_is_1\@ 2481 2482_initial_num_blocks_is_7\@: 2483 INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2484 sub $16*7, %r13 2485 jmp _initial_blocks_encrypted\@ 2486 2487_initial_num_blocks_is_6\@: 2488 INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2489 sub $16*6, %r13 2490 jmp _initial_blocks_encrypted\@ 2491 2492_initial_num_blocks_is_5\@: 2493 INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2494 sub $16*5, %r13 2495 jmp _initial_blocks_encrypted\@ 2496 2497_initial_num_blocks_is_4\@: 2498 INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2499 sub $16*4, %r13 2500 jmp _initial_blocks_encrypted\@ 2501 2502_initial_num_blocks_is_3\@: 2503 INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2504 sub $16*3, %r13 2505 jmp _initial_blocks_encrypted\@ 2506 2507_initial_num_blocks_is_2\@: 2508 INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2509 sub $16*2, %r13 2510 jmp _initial_blocks_encrypted\@ 2511 2512_initial_num_blocks_is_1\@: 2513 INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2514 sub $16*1, %r13 2515 jmp _initial_blocks_encrypted\@ 2516 2517_initial_num_blocks_is_0\@: 2518 INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2519 2520 2521_initial_blocks_encrypted\@: 2522 cmp $0, %r13 2523 je _zero_cipher_left\@ 2524 2525 sub $128, %r13 2526 je _eight_cipher_left\@ 2527 2528 2529 2530 2531 vmovd %xmm9, %r15d 2532 and $255, %r15d 2533 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2534 2535 2536_encrypt_by_8_new\@: 2537 cmp $(255-8), %r15d 2538 jg _encrypt_by_8\@ 2539 2540 2541 2542 add $8, %r15b 2543 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC 2544 add $128, %r11 2545 sub $128, %r13 2546 jne _encrypt_by_8_new\@ 2547 2548 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2549 jmp _eight_cipher_left\@ 2550 2551_encrypt_by_8\@: 2552 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2553 add $8, %r15b 2554 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC 2555 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2556 add $128, %r11 2557 sub $128, %r13 2558 jne _encrypt_by_8_new\@ 2559 2560 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2561 2562 2563 2564 2565_eight_cipher_left\@: 2566 GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 2567 2568 2569_zero_cipher_left\@: 2570 cmp $16, arg4 2571 jl _only_less_than_16\@ 2572 2573 mov arg4, %r13 2574 and $15, %r13 # r13 = (arg4 mod 16) 2575 2576 je _multiple_of_16_bytes\@ 2577 2578 # handle the last <16 Byte block seperately 2579 2580 2581 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn 2582 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2583 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) 2584 2585 sub $16, %r11 2586 add %r13, %r11 2587 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block 2588 2589 lea SHIFT_MASK+16(%rip), %r12 2590 sub %r13, %r12 # adjust the shuffle mask pointer 2591 # to be able to shift 16-r13 bytes 2592 # (r13 is the number of bytes in plaintext mod 16) 2593 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask 2594 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes 2595 jmp _final_ghash_mul\@ 2596 2597_only_less_than_16\@: 2598 # check for 0 length 2599 mov arg4, %r13 2600 and $15, %r13 # r13 = (arg4 mod 16) 2601 2602 je _multiple_of_16_bytes\@ 2603 2604 # handle the last <16 Byte block seperately 2605 2606 2607 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn 2608 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2609 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) 2610 2611 2612 lea SHIFT_MASK+16(%rip), %r12 2613 sub %r13, %r12 # adjust the shuffle mask pointer to be 2614 # able to shift 16-r13 bytes (r13 is the 2615 # number of bytes in plaintext mod 16) 2616 2617_get_last_16_byte_loop\@: 2618 movb (arg3, %r11), %al 2619 movb %al, TMP1 (%rsp , %r11) 2620 add $1, %r11 2621 cmp %r13, %r11 2622 jne _get_last_16_byte_loop\@ 2623 2624 vmovdqu TMP1(%rsp), %xmm1 2625 2626 sub $16, %r11 2627 2628_final_ghash_mul\@: 2629 .if \ENC_DEC == DEC 2630 vmovdqa %xmm1, %xmm2 2631 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 2632 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 2633 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 2634 vpand %xmm1, %xmm2, %xmm2 2635 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 2636 vpxor %xmm2, %xmm14, %xmm14 2637 #GHASH computation for the last <16 Byte block 2638 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 2639 sub %r13, %r11 2640 add $16, %r11 2641 .else 2642 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 2643 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 2644 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 2645 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2646 vpxor %xmm9, %xmm14, %xmm14 2647 #GHASH computation for the last <16 Byte block 2648 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 2649 sub %r13, %r11 2650 add $16, %r11 2651 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext 2652 .endif 2653 2654 2655 ############################# 2656 # output r13 Bytes 2657 vmovq %xmm9, %rax 2658 cmp $8, %r13 2659 jle _less_than_8_bytes_left\@ 2660 2661 mov %rax, (arg2 , %r11) 2662 add $8, %r11 2663 vpsrldq $8, %xmm9, %xmm9 2664 vmovq %xmm9, %rax 2665 sub $8, %r13 2666 2667_less_than_8_bytes_left\@: 2668 movb %al, (arg2 , %r11) 2669 add $1, %r11 2670 shr $8, %rax 2671 sub $1, %r13 2672 jne _less_than_8_bytes_left\@ 2673 ############################# 2674 2675_multiple_of_16_bytes\@: 2676 mov arg7, %r12 # r12 = aadLen (number of bytes) 2677 shl $3, %r12 # convert into number of bits 2678 vmovd %r12d, %xmm15 # len(A) in xmm15 2679 2680 shl $3, arg4 # len(C) in bits (*128) 2681 vmovq arg4, %xmm1 2682 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 2683 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) 2684 2685 vpxor %xmm15, %xmm14, %xmm14 2686 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation 2687 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap 2688 2689 mov arg5, %rax # rax = *Y0 2690 vmovdqu (%rax), %xmm9 # xmm9 = Y0 2691 2692 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) 2693 2694 vpxor %xmm14, %xmm9, %xmm9 2695 2696 2697 2698_return_T\@: 2699 mov arg8, %r10 # r10 = authTag 2700 mov arg9, %r11 # r11 = auth_tag_len 2701 2702 cmp $16, %r11 2703 je _T_16\@ 2704 2705 cmp $12, %r11 2706 je _T_12\@ 2707 2708_T_8\@: 2709 vmovq %xmm9, %rax 2710 mov %rax, (%r10) 2711 jmp _return_T_done\@ 2712_T_12\@: 2713 vmovq %xmm9, %rax 2714 mov %rax, (%r10) 2715 vpsrldq $8, %xmm9, %xmm9 2716 vmovd %xmm9, %eax 2717 mov %eax, 8(%r10) 2718 jmp _return_T_done\@ 2719 2720_T_16\@: 2721 vmovdqu %xmm9, (%r10) 2722 2723_return_T_done\@: 2724 mov %r14, %rsp 2725 2726 pop %r15 2727 pop %r14 2728 pop %r13 2729 pop %r12 2730.endm 2731 2732 2733############################################################# 2734#void aesni_gcm_precomp_avx_gen4 2735# (gcm_data *my_ctx_data, 2736# u8 *hash_subkey)# /* H, the Hash sub key input. 2737# Data starts on a 16-byte boundary. */ 2738############################################################# 2739ENTRY(aesni_gcm_precomp_avx_gen4) 2740 #the number of pushes must equal STACK_OFFSET 2741 push %r12 2742 push %r13 2743 push %r14 2744 push %r15 2745 2746 mov %rsp, %r14 2747 2748 2749 2750 sub $VARIABLE_OFFSET, %rsp 2751 and $~63, %rsp # align rsp to 64 bytes 2752 2753 vmovdqu (arg2), %xmm6 # xmm6 = HashKey 2754 2755 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 2756 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey 2757 vmovdqa %xmm6, %xmm2 2758 vpsllq $1, %xmm6, %xmm6 2759 vpsrlq $63, %xmm2, %xmm2 2760 vmovdqa %xmm2, %xmm1 2761 vpslldq $8, %xmm2, %xmm2 2762 vpsrldq $8, %xmm1, %xmm1 2763 vpor %xmm2, %xmm6, %xmm6 2764 #reduction 2765 vpshufd $0b00100100, %xmm1, %xmm2 2766 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 2767 vpand POLY(%rip), %xmm2, %xmm2 2768 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly 2769 ####################################################################### 2770 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly 2771 2772 2773 PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 2774 2775 mov %r14, %rsp 2776 2777 pop %r15 2778 pop %r14 2779 pop %r13 2780 pop %r12 2781 ret 2782ENDPROC(aesni_gcm_precomp_avx_gen4) 2783 2784 2785############################################################################### 2786#void aesni_gcm_enc_avx_gen4( 2787# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2788# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ 2789# const u8 *in, /* Plaintext input */ 2790# u64 plaintext_len, /* Length of data in Bytes for encryption. */ 2791# u8 *iv, /* Pre-counter block j0: 4 byte salt 2792# (from Security Association) concatenated with 8 byte 2793# Initialisation Vector (from IPSec ESP Payload) 2794# concatenated with 0x00000001. 16-byte aligned pointer. */ 2795# const u8 *aad, /* Additional Authentication Data (AAD)*/ 2796# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 2797# u8 *auth_tag, /* Authenticated Tag output. */ 2798# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 2799# Valid values are 16 (most likely), 12 or 8. */ 2800############################################################################### 2801ENTRY(aesni_gcm_enc_avx_gen4) 2802 GCM_ENC_DEC_AVX2 ENC 2803 ret 2804ENDPROC(aesni_gcm_enc_avx_gen4) 2805 2806############################################################################### 2807#void aesni_gcm_dec_avx_gen4( 2808# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2809# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ 2810# const u8 *in, /* Ciphertext input */ 2811# u64 plaintext_len, /* Length of data in Bytes for encryption. */ 2812# u8 *iv, /* Pre-counter block j0: 4 byte salt 2813# (from Security Association) concatenated with 8 byte 2814# Initialisation Vector (from IPSec ESP Payload) 2815# concatenated with 0x00000001. 16-byte aligned pointer. */ 2816# const u8 *aad, /* Additional Authentication Data (AAD)*/ 2817# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 2818# u8 *auth_tag, /* Authenticated Tag output. */ 2819# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 2820# Valid values are 16 (most likely), 12 or 8. */ 2821############################################################################### 2822ENTRY(aesni_gcm_dec_avx_gen4) 2823 GCM_ENC_DEC_AVX2 DEC 2824 ret 2825ENDPROC(aesni_gcm_dec_avx_gen4) 2826 2827#endif /* CONFIG_AS_AVX2 */ 2828