1######################################################################## 2# Copyright (c) 2013, Intel Corporation 3# 4# This software is available to you under a choice of one of two 5# licenses. You may choose to be licensed under the terms of the GNU 6# General Public License (GPL) Version 2, available from the file 7# COPYING in the main directory of this source tree, or the 8# OpenIB.org BSD license below: 9# 10# Redistribution and use in source and binary forms, with or without 11# modification, are permitted provided that the following conditions are 12# met: 13# 14# * Redistributions of source code must retain the above copyright 15# notice, this list of conditions and the following disclaimer. 16# 17# * Redistributions in binary form must reproduce the above copyright 18# notice, this list of conditions and the following disclaimer in the 19# documentation and/or other materials provided with the 20# distribution. 21# 22# * Neither the name of the Intel Corporation nor the names of its 23# contributors may be used to endorse or promote products derived from 24# this software without specific prior written permission. 25# 26# 27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY 28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR 31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR 34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38######################################################################## 39## 40## Authors: 41## Erdinc Ozturk <erdinc.ozturk@intel.com> 42## Vinodh Gopal <vinodh.gopal@intel.com> 43## James Guilford <james.guilford@intel.com> 44## Tim Chen <tim.c.chen@linux.intel.com> 45## 46## References: 47## This code was derived and highly optimized from the code described in paper: 48## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation 49## on Intel Architecture Processors. August, 2010 50## The details of the implementation is explained in: 51## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode 52## on Intel Architecture Processors. October, 2012. 
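##
## For illustration only (a hedged C sketch, not assembled as part of this
## file): how the 16-byte pre-counter block described under "iv:" below is
## put together for RFC4106.  The helper name and its arguments are
## hypothetical; only the layout (4-byte salt || 8-byte IV || 0x00000001)
## comes from this header.  Needs <string.h> if actually compiled.
##
##      static void rfc4106_build_j0(unsigned char j0[16],
##                                   const unsigned char salt[4],
##                                   const unsigned char iv[8])
##      {
##              memcpy(j0, salt, 4);          /* salt from the SA           */
##              memcpy(j0 + 4, iv, 8);        /* per-packet IV (ESP seq #)  */
##              j0[12] = 0; j0[13] = 0;
##              j0[14] = 0; j0[15] = 1;       /* block counter starts at 1  */
##      }
##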
53## 54## Assumptions: 55## 56## 57## 58## iv: 59## 0 1 2 3 60## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 61## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 62## | Salt (From the SA) | 63## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 64## | Initialization Vector | 65## | (This is the sequence number from IPSec header) | 66## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 67## | 0x1 | 68## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 69## 70## 71## 72## AAD: 73## AAD padded to 128 bits with 0 74## for example, assume AAD is a u32 vector 75## 76## if AAD is 8 bytes: 77## AAD[3] = {A0, A1}# 78## padded AAD in xmm register = {A1 A0 0 0} 79## 80## 0 1 2 3 81## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 82## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 83## | SPI (A1) | 84## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 85## | 32-bit Sequence Number (A0) | 86## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 87## | 0x0 | 88## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 89## 90## AAD Format with 32-bit Sequence Number 91## 92## if AAD is 12 bytes: 93## AAD[3] = {A0, A1, A2}# 94## padded AAD in xmm register = {A2 A1 A0 0} 95## 96## 0 1 2 3 97## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 98## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 99## | SPI (A2) | 100## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 101## | 64-bit Extended Sequence Number {A1,A0} | 102## | | 103## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 104## | 0x0 | 105## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 106## 107## AAD Format with 64-bit Extended Sequence Number 108## 109## 110## aadLen: 111## from the definition of the spec, aadLen can only be 8 or 12 bytes. 112## The code additionally supports aadLen of length 16 bytes. 113## 114## TLen: 115## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 116## 117## poly = x^128 + x^127 + x^126 + x^121 + 1 118## throughout the code, one tab and two tab indentations are used. one tab is 119## for GHASH part, two tabs is for AES part. 120## 121 122#include <linux/linkage.h> 123#include <asm/inst.h> 124 125# constants in mergeable sections, linker can reorder and merge 126.section .rodata.cst16.POLY, "aM", @progbits, 16 127.align 16 128POLY: .octa 0xC2000000000000000000000000000001 129 130.section .rodata.cst16.POLY2, "aM", @progbits, 16 131.align 16 132POLY2: .octa 0xC20000000000000000000001C2000000 133 134.section .rodata.cst16.TWOONE, "aM", @progbits, 16 135.align 16 136TWOONE: .octa 0x00000001000000000000000000000001 137 138.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 139.align 16 140SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F 141 142.section .rodata.cst16.ONE, "aM", @progbits, 16 143.align 16 144ONE: .octa 0x00000000000000000000000000000001 145 146.section .rodata.cst16.ONEf, "aM", @progbits, 16 147.align 16 148ONEf: .octa 0x01000000000000000000000000000000 149 150# order of these constants should not change. 
151# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F 152.section .rodata, "a", @progbits 153.align 16 154SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 155ALL_F: .octa 0xffffffffffffffffffffffffffffffff 156 .octa 0x00000000000000000000000000000000 157 158.section .rodata 159.align 16 160.type aad_shift_arr, @object 161.size aad_shift_arr, 272 162aad_shift_arr: 163 .octa 0xffffffffffffffffffffffffffffffff 164 .octa 0xffffffffffffffffffffffffffffff0C 165 .octa 0xffffffffffffffffffffffffffff0D0C 166 .octa 0xffffffffffffffffffffffffff0E0D0C 167 .octa 0xffffffffffffffffffffffff0F0E0D0C 168 .octa 0xffffffffffffffffffffff0C0B0A0908 169 .octa 0xffffffffffffffffffff0D0C0B0A0908 170 .octa 0xffffffffffffffffff0E0D0C0B0A0908 171 .octa 0xffffffffffffffff0F0E0D0C0B0A0908 172 .octa 0xffffffffffffff0C0B0A090807060504 173 .octa 0xffffffffffff0D0C0B0A090807060504 174 .octa 0xffffffffff0E0D0C0B0A090807060504 175 .octa 0xffffffff0F0E0D0C0B0A090807060504 176 .octa 0xffffff0C0B0A09080706050403020100 177 .octa 0xffff0D0C0B0A09080706050403020100 178 .octa 0xff0E0D0C0B0A09080706050403020100 179 .octa 0x0F0E0D0C0B0A09080706050403020100 180 181 182.text 183 184 185#define AadHash 16*0 186#define AadLen 16*1 187#define InLen (16*1)+8 188#define PBlockEncKey 16*2 189#define OrigIV 16*3 190#define CurCount 16*4 191#define PBlockLen 16*5 192 193HashKey = 16*6 # store HashKey <<1 mod poly here 194HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here 195HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here 196HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here 197HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here 198HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here 199HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here 200HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here 201HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) 202HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) 203HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) 204HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) 205HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) 206HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) 207HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) 208HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) 209 210#define arg1 %rdi 211#define arg2 %rsi 212#define arg3 %rdx 213#define arg4 %rcx 214#define arg5 %r8 215#define arg6 %r9 216#define arg7 STACK_OFFSET+8*1(%r14) 217#define arg8 STACK_OFFSET+8*2(%r14) 218#define arg9 STACK_OFFSET+8*3(%r14) 219#define arg10 STACK_OFFSET+8*4(%r14) 220#define keysize 2*15*16(arg1) 221 222i = 0 223j = 0 224 225out_order = 0 226in_order = 1 227DEC = 0 228ENC = 1 229 230.macro define_reg r n 231reg_\r = %xmm\n 232.endm 233 234.macro setreg 235.altmacro 236define_reg i %i 237define_reg j %j 238.noaltmacro 239.endm 240 241# need to push 4 registers into stack to maintain 242STACK_OFFSET = 8*4 243 244TMP1 = 16*0 # Temporary storage for AAD 245TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) 246TMP3 = 16*2 # Temporary storage for AES State 3 247TMP4 = 16*3 # Temporary storage for AES State 4 248TMP5 = 16*4 # Temporary storage for AES State 5 249TMP6 = 16*5 # Temporary storage for AES State 6 250TMP7 = 16*6 # Temporary storage for 
AES State 7 251TMP8 = 16*7 # Temporary storage for AES State 8 252 253VARIABLE_OFFSET = 16*8 254 255################################ 256# Utility Macros 257################################ 258 259.macro FUNC_SAVE 260 #the number of pushes must equal STACK_OFFSET 261 push %r12 262 push %r13 263 push %r14 264 push %r15 265 266 mov %rsp, %r14 267 268 269 270 sub $VARIABLE_OFFSET, %rsp 271 and $~63, %rsp # align rsp to 64 bytes 272.endm 273 274.macro FUNC_RESTORE 275 mov %r14, %rsp 276 277 pop %r15 278 pop %r14 279 pop %r13 280 pop %r12 281.endm 282 283# Encryption of a single block 284.macro ENCRYPT_SINGLE_BLOCK REP XMM0 285 vpxor (arg1), \XMM0, \XMM0 286 i = 1 287 setreg 288.rep \REP 289 vaesenc 16*i(arg1), \XMM0, \XMM0 290 i = (i+1) 291 setreg 292.endr 293 vaesenclast 16*i(arg1), \XMM0, \XMM0 294.endm 295 296# combined for GCM encrypt and decrypt functions 297# clobbering all xmm registers 298# clobbering r10, r11, r12, r13, r14, r15 299.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP 300 vmovdqu AadHash(arg2), %xmm8 301 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey 302 add arg5, InLen(arg2) 303 304 # initialize the data pointer offset as zero 305 xor %r11d, %r11d 306 307 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC 308 sub %r11, arg5 309 310 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext 311 and $-16, %r13 # r13 = r13 - (r13 mod 16) 312 313 mov %r13, %r12 314 shr $4, %r12 315 and $7, %r12 316 jz _initial_num_blocks_is_0\@ 317 318 cmp $7, %r12 319 je _initial_num_blocks_is_7\@ 320 cmp $6, %r12 321 je _initial_num_blocks_is_6\@ 322 cmp $5, %r12 323 je _initial_num_blocks_is_5\@ 324 cmp $4, %r12 325 je _initial_num_blocks_is_4\@ 326 cmp $3, %r12 327 je _initial_num_blocks_is_3\@ 328 cmp $2, %r12 329 je _initial_num_blocks_is_2\@ 330 331 jmp _initial_num_blocks_is_1\@ 332 333_initial_num_blocks_is_7\@: 334 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 335 sub $16*7, %r13 336 jmp _initial_blocks_encrypted\@ 337 338_initial_num_blocks_is_6\@: 339 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 340 sub $16*6, %r13 341 jmp _initial_blocks_encrypted\@ 342 343_initial_num_blocks_is_5\@: 344 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 345 sub $16*5, %r13 346 jmp _initial_blocks_encrypted\@ 347 348_initial_num_blocks_is_4\@: 349 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 350 sub $16*4, %r13 351 jmp _initial_blocks_encrypted\@ 352 353_initial_num_blocks_is_3\@: 354 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 355 sub $16*3, %r13 356 jmp _initial_blocks_encrypted\@ 357 358_initial_num_blocks_is_2\@: 359 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 360 sub $16*2, %r13 361 jmp _initial_blocks_encrypted\@ 362 363_initial_num_blocks_is_1\@: 364 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, 
\ENC_DEC 365 sub $16*1, %r13 366 jmp _initial_blocks_encrypted\@ 367 368_initial_num_blocks_is_0\@: 369 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 370 371 372_initial_blocks_encrypted\@: 373 cmp $0, %r13 374 je _zero_cipher_left\@ 375 376 sub $128, %r13 377 je _eight_cipher_left\@ 378 379 380 381 382 vmovd %xmm9, %r15d 383 and $255, %r15d 384 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 385 386 387_encrypt_by_8_new\@: 388 cmp $(255-8), %r15d 389 jg _encrypt_by_8\@ 390 391 392 393 add $8, %r15b 394 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC 395 add $128, %r11 396 sub $128, %r13 397 jne _encrypt_by_8_new\@ 398 399 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 400 jmp _eight_cipher_left\@ 401 402_encrypt_by_8\@: 403 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 404 add $8, %r15b 405 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC 406 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 407 add $128, %r11 408 sub $128, %r13 409 jne _encrypt_by_8_new\@ 410 411 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 412 413 414 415 416_eight_cipher_left\@: 417 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 418 419 420_zero_cipher_left\@: 421 vmovdqu %xmm14, AadHash(arg2) 422 vmovdqu %xmm9, CurCount(arg2) 423 424 # check for 0 length 425 mov arg5, %r13 426 and $15, %r13 # r13 = (arg5 mod 16) 427 428 je _multiple_of_16_bytes\@ 429 430 # handle the last <16 Byte block separately 431 432 mov %r13, PBlockLen(arg2) 433 434 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn 435 vmovdqu %xmm9, CurCount(arg2) 436 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 437 438 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) 439 vmovdqu %xmm9, PBlockEncKey(arg2) 440 441 cmp $16, arg5 442 jge _large_enough_update\@ 443 444 lea (arg4,%r11,1), %r10 445 mov %r13, %r12 446 447 READ_PARTIAL_BLOCK %r10 %r12 %xmm1 448 449 lea SHIFT_MASK+16(%rip), %r12 450 sub %r13, %r12 # adjust the shuffle mask pointer to be 451 # able to shift 16-r13 bytes (r13 is the 452 # number of bytes in plaintext mod 16) 453 454 jmp _final_ghash_mul\@ 455 456_large_enough_update\@: 457 sub $16, %r11 458 add %r13, %r11 459 460 # receive the last <16 Byte block 461 vmovdqu (arg4, %r11, 1), %xmm1 462 463 sub %r13, %r11 464 add $16, %r11 465 466 lea SHIFT_MASK+16(%rip), %r12 467 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 468 # (r13 is the number of bytes in plaintext mod 16) 469 sub %r13, %r12 470 # get the appropriate shuffle mask 471 vmovdqu (%r12), %xmm2 472 # shift right 16-r13 bytes 473 vpshufb %xmm2, %xmm1, %xmm1 474 475_final_ghash_mul\@: 476 .if \ENC_DEC == DEC 477 vmovdqa %xmm1, %xmm2 478 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 479 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 480 # mask out top 16-r13 bytes of xmm9 481 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 482 vpand %xmm1, %xmm2, %xmm2 483 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 484 vpxor %xmm2, %xmm14, %xmm14 485 486 vmovdqu %xmm14, AadHash(arg2) 487 .else 488 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 489 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 490 # mask out top 16-r13 bytes of xmm9 491 vpand %xmm1, %xmm9, %xmm9 # 
mask out top 16-r13 bytes of xmm9 492 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 493 vpxor %xmm9, %xmm14, %xmm14 494 495 vmovdqu %xmm14, AadHash(arg2) 496 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext 497 .endif 498 499 500 ############################# 501 # output r13 Bytes 502 vmovq %xmm9, %rax 503 cmp $8, %r13 504 jle _less_than_8_bytes_left\@ 505 506 mov %rax, (arg3 , %r11) 507 add $8, %r11 508 vpsrldq $8, %xmm9, %xmm9 509 vmovq %xmm9, %rax 510 sub $8, %r13 511 512_less_than_8_bytes_left\@: 513 movb %al, (arg3 , %r11) 514 add $1, %r11 515 shr $8, %rax 516 sub $1, %r13 517 jne _less_than_8_bytes_left\@ 518 ############################# 519 520_multiple_of_16_bytes\@: 521.endm 522 523 524# GCM_COMPLETE Finishes update of tag of last partial block 525# Output: Authorization Tag (AUTH_TAG) 526# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 527.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN 528 vmovdqu AadHash(arg2), %xmm14 529 vmovdqu HashKey(arg2), %xmm13 530 531 mov PBlockLen(arg2), %r12 532 cmp $0, %r12 533 je _partial_done\@ 534 535 #GHASH computation for the last <16 Byte block 536 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 537 538_partial_done\@: 539 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) 540 shl $3, %r12 # convert into number of bits 541 vmovd %r12d, %xmm15 # len(A) in xmm15 542 543 mov InLen(arg2), %r12 544 shl $3, %r12 # len(C) in bits (*128) 545 vmovq %r12, %xmm1 546 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 547 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) 548 549 vpxor %xmm15, %xmm14, %xmm14 550 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation 551 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap 552 553 vmovdqu OrigIV(arg2), %xmm9 554 555 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) 556 557 vpxor %xmm14, %xmm9, %xmm9 558 559 560 561_return_T\@: 562 mov \AUTH_TAG, %r10 # r10 = authTag 563 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len 564 565 cmp $16, %r11 566 je _T_16\@ 567 568 cmp $8, %r11 569 jl _T_4\@ 570 571_T_8\@: 572 vmovq %xmm9, %rax 573 mov %rax, (%r10) 574 add $8, %r10 575 sub $8, %r11 576 vpsrldq $8, %xmm9, %xmm9 577 cmp $0, %r11 578 je _return_T_done\@ 579_T_4\@: 580 vmovd %xmm9, %eax 581 mov %eax, (%r10) 582 add $4, %r10 583 sub $4, %r11 584 vpsrldq $4, %xmm9, %xmm9 585 cmp $0, %r11 586 je _return_T_done\@ 587_T_123\@: 588 vmovd %xmm9, %eax 589 cmp $2, %r11 590 jl _T_1\@ 591 mov %ax, (%r10) 592 cmp $2, %r11 593 je _return_T_done\@ 594 add $2, %r10 595 sar $16, %eax 596_T_1\@: 597 mov %al, (%r10) 598 jmp _return_T_done\@ 599 600_T_16\@: 601 vmovdqu %xmm9, (%r10) 602 603_return_T_done\@: 604.endm 605 606.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 607 608 mov \AAD, %r10 # r10 = AAD 609 mov \AADLEN, %r12 # r12 = aadLen 610 611 612 mov %r12, %r11 613 614 vpxor \T8, \T8, \T8 615 vpxor \T7, \T7, \T7 616 cmp $16, %r11 617 jl _get_AAD_rest8\@ 618_get_AAD_blocks\@: 619 vmovdqu (%r10), \T7 620 vpshufb SHUF_MASK(%rip), \T7, \T7 621 vpxor \T7, \T8, \T8 622 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 623 add $16, %r10 624 sub $16, %r12 625 sub $16, %r11 626 cmp $16, %r11 627 jge _get_AAD_blocks\@ 628 vmovdqu \T8, \T7 629 cmp $0, %r11 630 je _get_AAD_done\@ 631 632 vpxor \T7, \T7, \T7 633 634 /* read the last <16B of AAD. 
since we have at least 4B of 635 data right after the AAD (the ICV, and maybe some CT), we can 636 read 4B/8B blocks safely, and then get rid of the extra stuff */ 637_get_AAD_rest8\@: 638 cmp $4, %r11 639 jle _get_AAD_rest4\@ 640 movq (%r10), \T1 641 add $8, %r10 642 sub $8, %r11 643 vpslldq $8, \T1, \T1 644 vpsrldq $8, \T7, \T7 645 vpxor \T1, \T7, \T7 646 jmp _get_AAD_rest8\@ 647_get_AAD_rest4\@: 648 cmp $0, %r11 649 jle _get_AAD_rest0\@ 650 mov (%r10), %eax 651 movq %rax, \T1 652 add $4, %r10 653 sub $4, %r11 654 vpslldq $12, \T1, \T1 655 vpsrldq $4, \T7, \T7 656 vpxor \T1, \T7, \T7 657_get_AAD_rest0\@: 658 /* finalize: shift out the extra bytes we read, and align 659 left. since pslldq can only shift by an immediate, we use 660 vpshufb and an array of shuffle masks */ 661 movq %r12, %r11 662 salq $4, %r11 663 vmovdqu aad_shift_arr(%r11), \T1 664 vpshufb \T1, \T7, \T7 665_get_AAD_rest_final\@: 666 vpshufb SHUF_MASK(%rip), \T7, \T7 667 vpxor \T8, \T7, \T7 668 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 669 670_get_AAD_done\@: 671 vmovdqu \T7, AadHash(arg2) 672.endm 673 674.macro INIT GHASH_MUL PRECOMPUTE 675 mov arg6, %r11 676 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length 677 xor %r11d, %r11d 678 mov %r11, InLen(arg2) # ctx_data.in_length = 0 679 680 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 681 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 682 mov arg3, %rax 683 movdqu (%rax), %xmm0 684 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv 685 686 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 687 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv 688 689 vmovdqu (arg4), %xmm6 # xmm6 = HashKey 690 691 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 692 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey 693 vmovdqa %xmm6, %xmm2 694 vpsllq $1, %xmm6, %xmm6 695 vpsrlq $63, %xmm2, %xmm2 696 vmovdqa %xmm2, %xmm1 697 vpslldq $8, %xmm2, %xmm2 698 vpsrldq $8, %xmm1, %xmm1 699 vpor %xmm2, %xmm6, %xmm6 700 #reduction 701 vpshufd $0b00100100, %xmm1, %xmm2 702 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 703 vpand POLY(%rip), %xmm2, %xmm2 704 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly 705 ####################################################################### 706 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly 707 708 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 709 710 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 711.endm 712 713 714# Reads DLEN bytes starting at DPTR and stores in XMMDst 715# where 0 < DLEN < 16 716# Clobbers %rax, DLEN 717.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst 718 vpxor \XMMDst, \XMMDst, \XMMDst 719 720 cmp $8, \DLEN 721 jl _read_lt8_\@ 722 mov (\DPTR), %rax 723 vpinsrq $0, %rax, \XMMDst, \XMMDst 724 sub $8, \DLEN 725 jz _done_read_partial_block_\@ 726 xor %eax, %eax 727_read_next_byte_\@: 728 shl $8, %rax 729 mov 7(\DPTR, \DLEN, 1), %al 730 dec \DLEN 731 jnz _read_next_byte_\@ 732 vpinsrq $1, %rax, \XMMDst, \XMMDst 733 jmp _done_read_partial_block_\@ 734_read_lt8_\@: 735 xor %eax, %eax 736_read_next_byte_lt8_\@: 737 shl $8, %rax 738 mov -1(\DPTR, \DLEN, 1), %al 739 dec \DLEN 740 jnz _read_next_byte_lt8_\@ 741 vpinsrq $0, %rax, \XMMDst, \XMMDst 742_done_read_partial_block_\@: 743.endm 744 745# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks 746# between update calls. 
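#
# Conceptually (an illustrative C-like sketch, not the exact register flow;
# "ctx" is a stand-in for the gcm_context_data whose field offsets are
# defined near the top of this file):
#
#       room = 16 - ctx->PBlockLen;                  /* space left in block */
#       take = room < PLAIN_CYPH_LEN ? room : PLAIN_CYPH_LEN;
#       /* XOR 'take' input bytes against the saved E(K,Yn) in PBlockEncKey,
#        * fold the resulting ciphertext into AadHash, then either finish the
#        * block (GHASH it, clear PBlockLen) or add 'take' to PBlockLen.     */
#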
747# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK 748# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context 749# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 750.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ 751 AAD_HASH ENC_DEC 752 mov PBlockLen(arg2), %r13 753 cmp $0, %r13 754 je _partial_block_done_\@ # Leave Macro if no partial blocks 755 # Read in input data without over reading 756 cmp $16, \PLAIN_CYPH_LEN 757 jl _fewer_than_16_bytes_\@ 758 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm 759 jmp _data_read_\@ 760 761_fewer_than_16_bytes_\@: 762 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 763 mov \PLAIN_CYPH_LEN, %r12 764 READ_PARTIAL_BLOCK %r10 %r12 %xmm1 765 766 mov PBlockLen(arg2), %r13 767 768_data_read_\@: # Finished reading in data 769 770 vmovdqu PBlockEncKey(arg2), %xmm9 771 vmovdqu HashKey(arg2), %xmm13 772 773 lea SHIFT_MASK(%rip), %r12 774 775 # adjust the shuffle mask pointer to be able to shift r13 bytes 776 # r16-r13 is the number of bytes in plaintext mod 16) 777 add %r13, %r12 778 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask 779 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes 780 781.if \ENC_DEC == DEC 782 vmovdqa %xmm1, %xmm3 783 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) 784 785 mov \PLAIN_CYPH_LEN, %r10 786 add %r13, %r10 787 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 788 sub $16, %r10 789 # Determine if if partial block is not being filled and 790 # shift mask accordingly 791 jge _no_extra_mask_1_\@ 792 sub %r10, %r12 793_no_extra_mask_1_\@: 794 795 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 796 # get the appropriate mask to mask out bottom r13 bytes of xmm9 797 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 798 799 vpand %xmm1, %xmm3, %xmm3 800 vmovdqa SHUF_MASK(%rip), %xmm10 801 vpshufb %xmm10, %xmm3, %xmm3 802 vpshufb %xmm2, %xmm3, %xmm3 803 vpxor %xmm3, \AAD_HASH, \AAD_HASH 804 805 cmp $0, %r10 806 jl _partial_incomplete_1_\@ 807 808 # GHASH computation for the last <16 Byte block 809 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 810 xor %eax,%eax 811 812 mov %rax, PBlockLen(arg2) 813 jmp _dec_done_\@ 814_partial_incomplete_1_\@: 815 add \PLAIN_CYPH_LEN, PBlockLen(arg2) 816_dec_done_\@: 817 vmovdqu \AAD_HASH, AadHash(arg2) 818.else 819 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 820 821 mov \PLAIN_CYPH_LEN, %r10 822 add %r13, %r10 823 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 824 sub $16, %r10 825 # Determine if if partial block is not being filled and 826 # shift mask accordingly 827 jge _no_extra_mask_2_\@ 828 sub %r10, %r12 829_no_extra_mask_2_\@: 830 831 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 832 # get the appropriate mask to mask out bottom r13 bytes of xmm9 833 vpand %xmm1, %xmm9, %xmm9 834 835 vmovdqa SHUF_MASK(%rip), %xmm1 836 vpshufb %xmm1, %xmm9, %xmm9 837 vpshufb %xmm2, %xmm9, %xmm9 838 vpxor %xmm9, \AAD_HASH, \AAD_HASH 839 840 cmp $0, %r10 841 jl _partial_incomplete_2_\@ 842 843 # GHASH computation for the last <16 Byte block 844 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 845 xor %eax,%eax 846 847 mov %rax, PBlockLen(arg2) 848 jmp _encode_done_\@ 849_partial_incomplete_2_\@: 850 add \PLAIN_CYPH_LEN, PBlockLen(arg2) 851_encode_done_\@: 852 vmovdqu \AAD_HASH, AadHash(arg2) 853 854 vmovdqa SHUF_MASK(%rip), %xmm10 855 # shuffle xmm9 back to output as ciphertext 856 vpshufb %xmm10, %xmm9, %xmm9 857 vpshufb 
%xmm2, %xmm9, %xmm9 858.endif 859 # output encrypted Bytes 860 cmp $0, %r10 861 jl _partial_fill_\@ 862 mov %r13, %r12 863 mov $16, %r13 864 # Set r13 to be the number of bytes to write out 865 sub %r12, %r13 866 jmp _count_set_\@ 867_partial_fill_\@: 868 mov \PLAIN_CYPH_LEN, %r13 869_count_set_\@: 870 vmovdqa %xmm9, %xmm0 871 vmovq %xmm0, %rax 872 cmp $8, %r13 873 jle _less_than_8_bytes_left_\@ 874 875 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 876 add $8, \DATA_OFFSET 877 psrldq $8, %xmm0 878 vmovq %xmm0, %rax 879 sub $8, %r13 880_less_than_8_bytes_left_\@: 881 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 882 add $1, \DATA_OFFSET 883 shr $8, %rax 884 sub $1, %r13 885 jne _less_than_8_bytes_left_\@ 886_partial_block_done_\@: 887.endm # PARTIAL_BLOCK 888 889############################################################################### 890# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 891# Input: A and B (128-bits each, bit-reflected) 892# Output: C = A*B*x mod poly, (i.e. >>1 ) 893# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 894# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 895############################################################################### 896.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 897 898 vpshufd $0b01001110, \GH, \T2 899 vpshufd $0b01001110, \HK, \T3 900 vpxor \GH , \T2, \T2 # T2 = (a1+a0) 901 vpxor \HK , \T3, \T3 # T3 = (b1+b0) 902 903 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 904 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 905 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) 906 vpxor \GH, \T2,\T2 907 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 908 909 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs 910 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs 911 vpxor \T3, \GH, \GH 912 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK 913 914 #first phase of the reduction 915 vpslld $31, \GH, \T2 # packed right shifting << 31 916 vpslld $30, \GH, \T3 # packed right shifting shift << 30 917 vpslld $25, \GH, \T4 # packed right shifting shift << 25 918 919 vpxor \T3, \T2, \T2 # xor the shifted versions 920 vpxor \T4, \T2, \T2 921 922 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW 923 924 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 925 vpxor \T2, \GH, \GH # first phase of the reduction complete 926 927 #second phase of the reduction 928 929 vpsrld $1,\GH, \T2 # packed left shifting >> 1 930 vpsrld $2,\GH, \T3 # packed left shifting >> 2 931 vpsrld $7,\GH, \T4 # packed left shifting >> 7 932 vpxor \T3, \T2, \T2 # xor the shifted versions 933 vpxor \T4, \T2, \T2 934 935 vpxor \T5, \T2, \T2 936 vpxor \T2, \GH, \GH 937 vpxor \T1, \GH, \GH # the result is in GH 938 939 940.endm 941 942.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 943 944 # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 945 vmovdqa \HK, \T5 946 947 vpshufd $0b01001110, \T5, \T1 948 vpxor \T5, \T1, \T1 949 vmovdqu \T1, HashKey_k(arg2) 950 951 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly 952 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly 953 vpshufd $0b01001110, \T5, \T1 954 vpxor \T5, \T1, \T1 955 vmovdqu \T1, HashKey_2_k(arg2) 956 957 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly 958 vmovdqu \T5, HashKey_3(arg2) 959 vpshufd $0b01001110, \T5, \T1 960 vpxor \T5, \T1, \T1 961 vmovdqu \T1, HashKey_3_k(arg2) 962 963 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly 964 vmovdqu \T5, HashKey_4(arg2) 965 vpshufd $0b01001110, \T5, \T1 966 vpxor \T5, 
\T1, \T1 967 vmovdqu \T1, HashKey_4_k(arg2) 968 969 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly 970 vmovdqu \T5, HashKey_5(arg2) 971 vpshufd $0b01001110, \T5, \T1 972 vpxor \T5, \T1, \T1 973 vmovdqu \T1, HashKey_5_k(arg2) 974 975 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly 976 vmovdqu \T5, HashKey_6(arg2) 977 vpshufd $0b01001110, \T5, \T1 978 vpxor \T5, \T1, \T1 979 vmovdqu \T1, HashKey_6_k(arg2) 980 981 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly 982 vmovdqu \T5, HashKey_7(arg2) 983 vpshufd $0b01001110, \T5, \T1 984 vpxor \T5, \T1, \T1 985 vmovdqu \T1, HashKey_7_k(arg2) 986 987 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly 988 vmovdqu \T5, HashKey_8(arg2) 989 vpshufd $0b01001110, \T5, \T1 990 vpxor \T5, \T1, \T1 991 vmovdqu \T1, HashKey_8_k(arg2) 992 993.endm 994 995## if a = number of total plaintext bytes 996## b = floor(a/16) 997## num_initial_blocks = b mod 4# 998## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext 999## r10, r11, r12, rax are clobbered 1000## arg1, arg3, arg4, r14 are used as a pointer only, not modified 1001 1002.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC 1003 i = (8-\num_initial_blocks) 1004 setreg 1005 vmovdqu AadHash(arg2), reg_i 1006 1007 # start AES for num_initial_blocks blocks 1008 vmovdqu CurCount(arg2), \CTR 1009 1010 i = (9-\num_initial_blocks) 1011 setreg 1012.rep \num_initial_blocks 1013 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1014 vmovdqa \CTR, reg_i 1015 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap 1016 i = (i+1) 1017 setreg 1018.endr 1019 1020 vmovdqa (arg1), \T_key 1021 i = (9-\num_initial_blocks) 1022 setreg 1023.rep \num_initial_blocks 1024 vpxor \T_key, reg_i, reg_i 1025 i = (i+1) 1026 setreg 1027.endr 1028 1029 j = 1 1030 setreg 1031.rep \REP 1032 vmovdqa 16*j(arg1), \T_key 1033 i = (9-\num_initial_blocks) 1034 setreg 1035.rep \num_initial_blocks 1036 vaesenc \T_key, reg_i, reg_i 1037 i = (i+1) 1038 setreg 1039.endr 1040 1041 j = (j+1) 1042 setreg 1043.endr 1044 1045 vmovdqa 16*j(arg1), \T_key 1046 i = (9-\num_initial_blocks) 1047 setreg 1048.rep \num_initial_blocks 1049 vaesenclast \T_key, reg_i, reg_i 1050 i = (i+1) 1051 setreg 1052.endr 1053 1054 i = (9-\num_initial_blocks) 1055 setreg 1056.rep \num_initial_blocks 1057 vmovdqu (arg4, %r11), \T1 1058 vpxor \T1, reg_i, reg_i 1059 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks 1060 add $16, %r11 1061.if \ENC_DEC == DEC 1062 vmovdqa \T1, reg_i 1063.endif 1064 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 1065 i = (i+1) 1066 setreg 1067.endr 1068 1069 1070 i = (8-\num_initial_blocks) 1071 j = (9-\num_initial_blocks) 1072 setreg 1073 1074.rep \num_initial_blocks 1075 vpxor reg_i, reg_j, reg_j 1076 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks 1077 i = (i+1) 1078 j = (j+1) 1079 setreg 1080.endr 1081 # XMM8 has the combined result here 1082 1083 vmovdqa \XMM8, TMP1(%rsp) 1084 vmovdqa \XMM8, \T3 1085 1086 cmp $128, %r13 1087 jl _initial_blocks_done\@ # no need for precomputed constants 1088 1089############################################################################### 1090# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 1091 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1092 vmovdqa \CTR, \XMM1 1093 vpshufb 
SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1094 1095 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1096 vmovdqa \CTR, \XMM2 1097 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1098 1099 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1100 vmovdqa \CTR, \XMM3 1101 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1102 1103 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1104 vmovdqa \CTR, \XMM4 1105 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1106 1107 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1108 vmovdqa \CTR, \XMM5 1109 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1110 1111 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1112 vmovdqa \CTR, \XMM6 1113 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1114 1115 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1116 vmovdqa \CTR, \XMM7 1117 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1118 1119 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1120 vmovdqa \CTR, \XMM8 1121 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1122 1123 vmovdqa (arg1), \T_key 1124 vpxor \T_key, \XMM1, \XMM1 1125 vpxor \T_key, \XMM2, \XMM2 1126 vpxor \T_key, \XMM3, \XMM3 1127 vpxor \T_key, \XMM4, \XMM4 1128 vpxor \T_key, \XMM5, \XMM5 1129 vpxor \T_key, \XMM6, \XMM6 1130 vpxor \T_key, \XMM7, \XMM7 1131 vpxor \T_key, \XMM8, \XMM8 1132 1133 i = 1 1134 setreg 1135.rep \REP # do REP rounds 1136 vmovdqa 16*i(arg1), \T_key 1137 vaesenc \T_key, \XMM1, \XMM1 1138 vaesenc \T_key, \XMM2, \XMM2 1139 vaesenc \T_key, \XMM3, \XMM3 1140 vaesenc \T_key, \XMM4, \XMM4 1141 vaesenc \T_key, \XMM5, \XMM5 1142 vaesenc \T_key, \XMM6, \XMM6 1143 vaesenc \T_key, \XMM7, \XMM7 1144 vaesenc \T_key, \XMM8, \XMM8 1145 i = (i+1) 1146 setreg 1147.endr 1148 1149 vmovdqa 16*i(arg1), \T_key 1150 vaesenclast \T_key, \XMM1, \XMM1 1151 vaesenclast \T_key, \XMM2, \XMM2 1152 vaesenclast \T_key, \XMM3, \XMM3 1153 vaesenclast \T_key, \XMM4, \XMM4 1154 vaesenclast \T_key, \XMM5, \XMM5 1155 vaesenclast \T_key, \XMM6, \XMM6 1156 vaesenclast \T_key, \XMM7, \XMM7 1157 vaesenclast \T_key, \XMM8, \XMM8 1158 1159 vmovdqu (arg4, %r11), \T1 1160 vpxor \T1, \XMM1, \XMM1 1161 vmovdqu \XMM1, (arg3 , %r11) 1162 .if \ENC_DEC == DEC 1163 vmovdqa \T1, \XMM1 1164 .endif 1165 1166 vmovdqu 16*1(arg4, %r11), \T1 1167 vpxor \T1, \XMM2, \XMM2 1168 vmovdqu \XMM2, 16*1(arg3 , %r11) 1169 .if \ENC_DEC == DEC 1170 vmovdqa \T1, \XMM2 1171 .endif 1172 1173 vmovdqu 16*2(arg4, %r11), \T1 1174 vpxor \T1, \XMM3, \XMM3 1175 vmovdqu \XMM3, 16*2(arg3 , %r11) 1176 .if \ENC_DEC == DEC 1177 vmovdqa \T1, \XMM3 1178 .endif 1179 1180 vmovdqu 16*3(arg4, %r11), \T1 1181 vpxor \T1, \XMM4, \XMM4 1182 vmovdqu \XMM4, 16*3(arg3 , %r11) 1183 .if \ENC_DEC == DEC 1184 vmovdqa \T1, \XMM4 1185 .endif 1186 1187 vmovdqu 16*4(arg4, %r11), \T1 1188 vpxor \T1, \XMM5, \XMM5 1189 vmovdqu \XMM5, 16*4(arg3 , %r11) 1190 .if \ENC_DEC == DEC 1191 vmovdqa \T1, \XMM5 1192 .endif 1193 1194 vmovdqu 16*5(arg4, %r11), \T1 1195 vpxor \T1, \XMM6, \XMM6 1196 vmovdqu \XMM6, 16*5(arg3 , %r11) 1197 .if \ENC_DEC == DEC 1198 vmovdqa \T1, \XMM6 1199 .endif 1200 1201 vmovdqu 16*6(arg4, %r11), \T1 1202 vpxor \T1, \XMM7, \XMM7 1203 vmovdqu \XMM7, 16*6(arg3 , %r11) 1204 .if \ENC_DEC == DEC 1205 vmovdqa \T1, \XMM7 1206 .endif 1207 1208 vmovdqu 16*7(arg4, %r11), \T1 1209 vpxor \T1, \XMM8, \XMM8 1210 vmovdqu \XMM8, 16*7(arg3 , %r11) 1211 .if \ENC_DEC == DEC 1212 vmovdqa \T1, \XMM8 1213 .endif 1214 1215 add $128, %r11 1216 1217 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1218 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine 
GHASHed value with the corresponding ciphertext 1219 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1220 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1221 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1222 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1223 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1224 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1225 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1226 1227############################################################################### 1228 1229_initial_blocks_done\@: 1230 1231.endm 1232 1233# encrypt 8 blocks at a time 1234# ghash the 8 previously encrypted ciphertext blocks 1235# arg1, arg3, arg4 are used as pointers only, not modified 1236# r11 is the data offset value 1237.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC 1238 1239 vmovdqa \XMM1, \T2 1240 vmovdqa \XMM2, TMP2(%rsp) 1241 vmovdqa \XMM3, TMP3(%rsp) 1242 vmovdqa \XMM4, TMP4(%rsp) 1243 vmovdqa \XMM5, TMP5(%rsp) 1244 vmovdqa \XMM6, TMP6(%rsp) 1245 vmovdqa \XMM7, TMP7(%rsp) 1246 vmovdqa \XMM8, TMP8(%rsp) 1247 1248.if \loop_idx == in_order 1249 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT 1250 vpaddd ONE(%rip), \XMM1, \XMM2 1251 vpaddd ONE(%rip), \XMM2, \XMM3 1252 vpaddd ONE(%rip), \XMM3, \XMM4 1253 vpaddd ONE(%rip), \XMM4, \XMM5 1254 vpaddd ONE(%rip), \XMM5, \XMM6 1255 vpaddd ONE(%rip), \XMM6, \XMM7 1256 vpaddd ONE(%rip), \XMM7, \XMM8 1257 vmovdqa \XMM8, \CTR 1258 1259 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1260 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1261 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1262 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1263 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1264 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1265 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1266 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1267.else 1268 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT 1269 vpaddd ONEf(%rip), \XMM1, \XMM2 1270 vpaddd ONEf(%rip), \XMM2, \XMM3 1271 vpaddd ONEf(%rip), \XMM3, \XMM4 1272 vpaddd ONEf(%rip), \XMM4, \XMM5 1273 vpaddd ONEf(%rip), \XMM5, \XMM6 1274 vpaddd ONEf(%rip), \XMM6, \XMM7 1275 vpaddd ONEf(%rip), \XMM7, \XMM8 1276 vmovdqa \XMM8, \CTR 1277.endif 1278 1279 1280 ####################################################################### 1281 1282 vmovdqu (arg1), \T1 1283 vpxor \T1, \XMM1, \XMM1 1284 vpxor \T1, \XMM2, \XMM2 1285 vpxor \T1, \XMM3, \XMM3 1286 vpxor \T1, \XMM4, \XMM4 1287 vpxor \T1, \XMM5, \XMM5 1288 vpxor \T1, \XMM6, \XMM6 1289 vpxor \T1, \XMM7, \XMM7 1290 vpxor \T1, \XMM8, \XMM8 1291 1292 ####################################################################### 1293 1294 1295 1296 1297 1298 vmovdqu 16*1(arg1), \T1 1299 vaesenc \T1, \XMM1, \XMM1 1300 vaesenc \T1, \XMM2, \XMM2 1301 vaesenc \T1, \XMM3, \XMM3 1302 vaesenc \T1, \XMM4, \XMM4 1303 vaesenc \T1, \XMM5, \XMM5 1304 vaesenc \T1, \XMM6, \XMM6 1305 vaesenc \T1, \XMM7, \XMM7 1306 vaesenc \T1, \XMM8, \XMM8 1307 1308 vmovdqu 16*2(arg1), \T1 1309 vaesenc \T1, \XMM1, \XMM1 1310 vaesenc \T1, \XMM2, \XMM2 1311 vaesenc \T1, \XMM3, \XMM3 1312 vaesenc \T1, \XMM4, \XMM4 1313 vaesenc \T1, \XMM5, \XMM5 1314 vaesenc \T1, \XMM6, \XMM6 1315 vaesenc \T1, \XMM7, \XMM7 1316 vaesenc \T1, \XMM8, \XMM8 1317 1318 1319 
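        # From here on the GHASH of the eight ciphertext blocks saved at the
        # top of this macro (\T2 and TMP2..TMP8) is interleaved with the
        # remaining AES rounds, so the vpclmulqdq latency hides behind the
        # vaesenc chain.  The oldest block is multiplied by HashKey^8, the
        # newest by HashKey, each with the Karatsuba identity
        #
        #       A*B:  high = a1*b1,  low = a0*b0,
        #             middle = (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0
        #
        # i.e. three carry-less multiplies per block; the precomputed
        # HashKey_i_k values hold the (b1^b0) halves.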
####################################################################### 1320 1321 vmovdqu HashKey_8(arg2), \T5 1322 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 1323 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 1324 1325 vpshufd $0b01001110, \T2, \T6 1326 vpxor \T2, \T6, \T6 1327 1328 vmovdqu HashKey_8_k(arg2), \T5 1329 vpclmulqdq $0x00, \T5, \T6, \T6 1330 1331 vmovdqu 16*3(arg1), \T1 1332 vaesenc \T1, \XMM1, \XMM1 1333 vaesenc \T1, \XMM2, \XMM2 1334 vaesenc \T1, \XMM3, \XMM3 1335 vaesenc \T1, \XMM4, \XMM4 1336 vaesenc \T1, \XMM5, \XMM5 1337 vaesenc \T1, \XMM6, \XMM6 1338 vaesenc \T1, \XMM7, \XMM7 1339 vaesenc \T1, \XMM8, \XMM8 1340 1341 vmovdqa TMP2(%rsp), \T1 1342 vmovdqu HashKey_7(arg2), \T5 1343 vpclmulqdq $0x11, \T5, \T1, \T3 1344 vpxor \T3, \T4, \T4 1345 vpclmulqdq $0x00, \T5, \T1, \T3 1346 vpxor \T3, \T7, \T7 1347 1348 vpshufd $0b01001110, \T1, \T3 1349 vpxor \T1, \T3, \T3 1350 vmovdqu HashKey_7_k(arg2), \T5 1351 vpclmulqdq $0x10, \T5, \T3, \T3 1352 vpxor \T3, \T6, \T6 1353 1354 vmovdqu 16*4(arg1), \T1 1355 vaesenc \T1, \XMM1, \XMM1 1356 vaesenc \T1, \XMM2, \XMM2 1357 vaesenc \T1, \XMM3, \XMM3 1358 vaesenc \T1, \XMM4, \XMM4 1359 vaesenc \T1, \XMM5, \XMM5 1360 vaesenc \T1, \XMM6, \XMM6 1361 vaesenc \T1, \XMM7, \XMM7 1362 vaesenc \T1, \XMM8, \XMM8 1363 1364 ####################################################################### 1365 1366 vmovdqa TMP3(%rsp), \T1 1367 vmovdqu HashKey_6(arg2), \T5 1368 vpclmulqdq $0x11, \T5, \T1, \T3 1369 vpxor \T3, \T4, \T4 1370 vpclmulqdq $0x00, \T5, \T1, \T3 1371 vpxor \T3, \T7, \T7 1372 1373 vpshufd $0b01001110, \T1, \T3 1374 vpxor \T1, \T3, \T3 1375 vmovdqu HashKey_6_k(arg2), \T5 1376 vpclmulqdq $0x10, \T5, \T3, \T3 1377 vpxor \T3, \T6, \T6 1378 1379 vmovdqu 16*5(arg1), \T1 1380 vaesenc \T1, \XMM1, \XMM1 1381 vaesenc \T1, \XMM2, \XMM2 1382 vaesenc \T1, \XMM3, \XMM3 1383 vaesenc \T1, \XMM4, \XMM4 1384 vaesenc \T1, \XMM5, \XMM5 1385 vaesenc \T1, \XMM6, \XMM6 1386 vaesenc \T1, \XMM7, \XMM7 1387 vaesenc \T1, \XMM8, \XMM8 1388 1389 vmovdqa TMP4(%rsp), \T1 1390 vmovdqu HashKey_5(arg2), \T5 1391 vpclmulqdq $0x11, \T5, \T1, \T3 1392 vpxor \T3, \T4, \T4 1393 vpclmulqdq $0x00, \T5, \T1, \T3 1394 vpxor \T3, \T7, \T7 1395 1396 vpshufd $0b01001110, \T1, \T3 1397 vpxor \T1, \T3, \T3 1398 vmovdqu HashKey_5_k(arg2), \T5 1399 vpclmulqdq $0x10, \T5, \T3, \T3 1400 vpxor \T3, \T6, \T6 1401 1402 vmovdqu 16*6(arg1), \T1 1403 vaesenc \T1, \XMM1, \XMM1 1404 vaesenc \T1, \XMM2, \XMM2 1405 vaesenc \T1, \XMM3, \XMM3 1406 vaesenc \T1, \XMM4, \XMM4 1407 vaesenc \T1, \XMM5, \XMM5 1408 vaesenc \T1, \XMM6, \XMM6 1409 vaesenc \T1, \XMM7, \XMM7 1410 vaesenc \T1, \XMM8, \XMM8 1411 1412 1413 vmovdqa TMP5(%rsp), \T1 1414 vmovdqu HashKey_4(arg2), \T5 1415 vpclmulqdq $0x11, \T5, \T1, \T3 1416 vpxor \T3, \T4, \T4 1417 vpclmulqdq $0x00, \T5, \T1, \T3 1418 vpxor \T3, \T7, \T7 1419 1420 vpshufd $0b01001110, \T1, \T3 1421 vpxor \T1, \T3, \T3 1422 vmovdqu HashKey_4_k(arg2), \T5 1423 vpclmulqdq $0x10, \T5, \T3, \T3 1424 vpxor \T3, \T6, \T6 1425 1426 vmovdqu 16*7(arg1), \T1 1427 vaesenc \T1, \XMM1, \XMM1 1428 vaesenc \T1, \XMM2, \XMM2 1429 vaesenc \T1, \XMM3, \XMM3 1430 vaesenc \T1, \XMM4, \XMM4 1431 vaesenc \T1, \XMM5, \XMM5 1432 vaesenc \T1, \XMM6, \XMM6 1433 vaesenc \T1, \XMM7, \XMM7 1434 vaesenc \T1, \XMM8, \XMM8 1435 1436 vmovdqa TMP6(%rsp), \T1 1437 vmovdqu HashKey_3(arg2), \T5 1438 vpclmulqdq $0x11, \T5, \T1, \T3 1439 vpxor \T3, \T4, \T4 1440 vpclmulqdq $0x00, \T5, \T1, \T3 1441 vpxor \T3, \T7, \T7 1442 1443 vpshufd $0b01001110, \T1, \T3 1444 vpxor \T1, \T3, \T3 1445 vmovdqu 
HashKey_3_k(arg2), \T5 1446 vpclmulqdq $0x10, \T5, \T3, \T3 1447 vpxor \T3, \T6, \T6 1448 1449 1450 vmovdqu 16*8(arg1), \T1 1451 vaesenc \T1, \XMM1, \XMM1 1452 vaesenc \T1, \XMM2, \XMM2 1453 vaesenc \T1, \XMM3, \XMM3 1454 vaesenc \T1, \XMM4, \XMM4 1455 vaesenc \T1, \XMM5, \XMM5 1456 vaesenc \T1, \XMM6, \XMM6 1457 vaesenc \T1, \XMM7, \XMM7 1458 vaesenc \T1, \XMM8, \XMM8 1459 1460 vmovdqa TMP7(%rsp), \T1 1461 vmovdqu HashKey_2(arg2), \T5 1462 vpclmulqdq $0x11, \T5, \T1, \T3 1463 vpxor \T3, \T4, \T4 1464 vpclmulqdq $0x00, \T5, \T1, \T3 1465 vpxor \T3, \T7, \T7 1466 1467 vpshufd $0b01001110, \T1, \T3 1468 vpxor \T1, \T3, \T3 1469 vmovdqu HashKey_2_k(arg2), \T5 1470 vpclmulqdq $0x10, \T5, \T3, \T3 1471 vpxor \T3, \T6, \T6 1472 1473 ####################################################################### 1474 1475 vmovdqu 16*9(arg1), \T5 1476 vaesenc \T5, \XMM1, \XMM1 1477 vaesenc \T5, \XMM2, \XMM2 1478 vaesenc \T5, \XMM3, \XMM3 1479 vaesenc \T5, \XMM4, \XMM4 1480 vaesenc \T5, \XMM5, \XMM5 1481 vaesenc \T5, \XMM6, \XMM6 1482 vaesenc \T5, \XMM7, \XMM7 1483 vaesenc \T5, \XMM8, \XMM8 1484 1485 vmovdqa TMP8(%rsp), \T1 1486 vmovdqu HashKey(arg2), \T5 1487 vpclmulqdq $0x11, \T5, \T1, \T3 1488 vpxor \T3, \T4, \T4 1489 vpclmulqdq $0x00, \T5, \T1, \T3 1490 vpxor \T3, \T7, \T7 1491 1492 vpshufd $0b01001110, \T1, \T3 1493 vpxor \T1, \T3, \T3 1494 vmovdqu HashKey_k(arg2), \T5 1495 vpclmulqdq $0x10, \T5, \T3, \T3 1496 vpxor \T3, \T6, \T6 1497 1498 vpxor \T4, \T6, \T6 1499 vpxor \T7, \T6, \T6 1500 1501 vmovdqu 16*10(arg1), \T5 1502 1503 i = 11 1504 setreg 1505.rep (\REP-9) 1506 1507 vaesenc \T5, \XMM1, \XMM1 1508 vaesenc \T5, \XMM2, \XMM2 1509 vaesenc \T5, \XMM3, \XMM3 1510 vaesenc \T5, \XMM4, \XMM4 1511 vaesenc \T5, \XMM5, \XMM5 1512 vaesenc \T5, \XMM6, \XMM6 1513 vaesenc \T5, \XMM7, \XMM7 1514 vaesenc \T5, \XMM8, \XMM8 1515 1516 vmovdqu 16*i(arg1), \T5 1517 i = i + 1 1518 setreg 1519.endr 1520 1521 i = 0 1522 j = 1 1523 setreg 1524.rep 8 1525 vpxor 16*i(arg4, %r11), \T5, \T2 1526 .if \ENC_DEC == ENC 1527 vaesenclast \T2, reg_j, reg_j 1528 .else 1529 vaesenclast \T2, reg_j, \T3 1530 vmovdqu 16*i(arg4, %r11), reg_j 1531 vmovdqu \T3, 16*i(arg3, %r11) 1532 .endif 1533 i = (i+1) 1534 j = (j+1) 1535 setreg 1536.endr 1537 ####################################################################### 1538 1539 1540 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs 1541 vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs 1542 vpxor \T3, \T7, \T7 1543 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 1544 1545 1546 1547 ####################################################################### 1548 #first phase of the reduction 1549 ####################################################################### 1550 vpslld $31, \T7, \T2 # packed right shifting << 31 1551 vpslld $30, \T7, \T3 # packed right shifting shift << 30 1552 vpslld $25, \T7, \T4 # packed right shifting shift << 25 1553 1554 vpxor \T3, \T2, \T2 # xor the shifted versions 1555 vpxor \T4, \T2, \T2 1556 1557 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 1558 1559 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 1560 vpxor \T2, \T7, \T7 # first phase of the reduction complete 1561 ####################################################################### 1562 .if \ENC_DEC == ENC 1563 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer 1564 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer 1565 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer 1566 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer 1567 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the 
Ciphertext buffer 1568 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer 1569 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer 1570 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer 1571 .endif 1572 1573 ####################################################################### 1574 #second phase of the reduction 1575 vpsrld $1, \T7, \T2 # packed left shifting >> 1 1576 vpsrld $2, \T7, \T3 # packed left shifting >> 2 1577 vpsrld $7, \T7, \T4 # packed left shifting >> 7 1578 vpxor \T3, \T2, \T2 # xor the shifted versions 1579 vpxor \T4, \T2, \T2 1580 1581 vpxor \T1, \T2, \T2 1582 vpxor \T2, \T7, \T7 1583 vpxor \T7, \T6, \T6 # the result is in T6 1584 ####################################################################### 1585 1586 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1587 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1588 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1589 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1590 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1591 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1592 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1593 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1594 1595 1596 vpxor \T6, \XMM1, \XMM1 1597 1598 1599 1600.endm 1601 1602 1603# GHASH the last 4 ciphertext blocks. 1604.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 1605 1606 ## Karatsuba Method 1607 1608 1609 vpshufd $0b01001110, \XMM1, \T2 1610 vpxor \XMM1, \T2, \T2 1611 vmovdqu HashKey_8(arg2), \T5 1612 vpclmulqdq $0x11, \T5, \XMM1, \T6 1613 vpclmulqdq $0x00, \T5, \XMM1, \T7 1614 1615 vmovdqu HashKey_8_k(arg2), \T3 1616 vpclmulqdq $0x00, \T3, \T2, \XMM1 1617 1618 ###################### 1619 1620 vpshufd $0b01001110, \XMM2, \T2 1621 vpxor \XMM2, \T2, \T2 1622 vmovdqu HashKey_7(arg2), \T5 1623 vpclmulqdq $0x11, \T5, \XMM2, \T4 1624 vpxor \T4, \T6, \T6 1625 1626 vpclmulqdq $0x00, \T5, \XMM2, \T4 1627 vpxor \T4, \T7, \T7 1628 1629 vmovdqu HashKey_7_k(arg2), \T3 1630 vpclmulqdq $0x00, \T3, \T2, \T2 1631 vpxor \T2, \XMM1, \XMM1 1632 1633 ###################### 1634 1635 vpshufd $0b01001110, \XMM3, \T2 1636 vpxor \XMM3, \T2, \T2 1637 vmovdqu HashKey_6(arg2), \T5 1638 vpclmulqdq $0x11, \T5, \XMM3, \T4 1639 vpxor \T4, \T6, \T6 1640 1641 vpclmulqdq $0x00, \T5, \XMM3, \T4 1642 vpxor \T4, \T7, \T7 1643 1644 vmovdqu HashKey_6_k(arg2), \T3 1645 vpclmulqdq $0x00, \T3, \T2, \T2 1646 vpxor \T2, \XMM1, \XMM1 1647 1648 ###################### 1649 1650 vpshufd $0b01001110, \XMM4, \T2 1651 vpxor \XMM4, \T2, \T2 1652 vmovdqu HashKey_5(arg2), \T5 1653 vpclmulqdq $0x11, \T5, \XMM4, \T4 1654 vpxor \T4, \T6, \T6 1655 1656 vpclmulqdq $0x00, \T5, \XMM4, \T4 1657 vpxor \T4, \T7, \T7 1658 1659 vmovdqu HashKey_5_k(arg2), \T3 1660 vpclmulqdq $0x00, \T3, \T2, \T2 1661 vpxor \T2, \XMM1, \XMM1 1662 1663 ###################### 1664 1665 vpshufd $0b01001110, \XMM5, \T2 1666 vpxor \XMM5, \T2, \T2 1667 vmovdqu HashKey_4(arg2), \T5 1668 vpclmulqdq $0x11, \T5, \XMM5, \T4 1669 vpxor \T4, \T6, \T6 1670 1671 vpclmulqdq $0x00, \T5, \XMM5, \T4 1672 vpxor \T4, \T7, \T7 1673 1674 vmovdqu HashKey_4_k(arg2), \T3 1675 vpclmulqdq $0x00, \T3, \T2, \T2 1676 vpxor \T2, \XMM1, \XMM1 1677 1678 ###################### 1679 1680 vpshufd $0b01001110, \XMM6, \T2 1681 vpxor \XMM6, \T2, \T2 1682 vmovdqu HashKey_3(arg2), \T5 1683 vpclmulqdq $0x11, \T5, \XMM6, \T4 1684 vpxor \T4, \T6, \T6 1685 1686 vpclmulqdq $0x00, \T5, 
\XMM6, \T4 1687 vpxor \T4, \T7, \T7 1688 1689 vmovdqu HashKey_3_k(arg2), \T3 1690 vpclmulqdq $0x00, \T3, \T2, \T2 1691 vpxor \T2, \XMM1, \XMM1 1692 1693 ###################### 1694 1695 vpshufd $0b01001110, \XMM7, \T2 1696 vpxor \XMM7, \T2, \T2 1697 vmovdqu HashKey_2(arg2), \T5 1698 vpclmulqdq $0x11, \T5, \XMM7, \T4 1699 vpxor \T4, \T6, \T6 1700 1701 vpclmulqdq $0x00, \T5, \XMM7, \T4 1702 vpxor \T4, \T7, \T7 1703 1704 vmovdqu HashKey_2_k(arg2), \T3 1705 vpclmulqdq $0x00, \T3, \T2, \T2 1706 vpxor \T2, \XMM1, \XMM1 1707 1708 ###################### 1709 1710 vpshufd $0b01001110, \XMM8, \T2 1711 vpxor \XMM8, \T2, \T2 1712 vmovdqu HashKey(arg2), \T5 1713 vpclmulqdq $0x11, \T5, \XMM8, \T4 1714 vpxor \T4, \T6, \T6 1715 1716 vpclmulqdq $0x00, \T5, \XMM8, \T4 1717 vpxor \T4, \T7, \T7 1718 1719 vmovdqu HashKey_k(arg2), \T3 1720 vpclmulqdq $0x00, \T3, \T2, \T2 1721 1722 vpxor \T2, \XMM1, \XMM1 1723 vpxor \T6, \XMM1, \XMM1 1724 vpxor \T7, \XMM1, \T2 1725 1726 1727 1728 1729 vpslldq $8, \T2, \T4 1730 vpsrldq $8, \T2, \T2 1731 1732 vpxor \T4, \T7, \T7 1733 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of 1734 # the accumulated carry-less multiplications 1735 1736 ####################################################################### 1737 #first phase of the reduction 1738 vpslld $31, \T7, \T2 # packed right shifting << 31 1739 vpslld $30, \T7, \T3 # packed right shifting shift << 30 1740 vpslld $25, \T7, \T4 # packed right shifting shift << 25 1741 1742 vpxor \T3, \T2, \T2 # xor the shifted versions 1743 vpxor \T4, \T2, \T2 1744 1745 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 1746 1747 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 1748 vpxor \T2, \T7, \T7 # first phase of the reduction complete 1749 ####################################################################### 1750 1751 1752 #second phase of the reduction 1753 vpsrld $1, \T7, \T2 # packed left shifting >> 1 1754 vpsrld $2, \T7, \T3 # packed left shifting >> 2 1755 vpsrld $7, \T7, \T4 # packed left shifting >> 7 1756 vpxor \T3, \T2, \T2 # xor the shifted versions 1757 vpxor \T4, \T2, \T2 1758 1759 vpxor \T1, \T2, \T2 1760 vpxor \T2, \T7, \T7 1761 vpxor \T7, \T6, \T6 # the result is in T6 1762 1763.endm 1764 1765############################################################# 1766#void aesni_gcm_precomp_avx_gen2 1767# (gcm_data *my_ctx_data, 1768# gcm_context_data *data, 1769# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ 1770# u8 *iv, /* Pre-counter block j0: 4 byte salt 1771# (from Security Association) concatenated with 8 byte 1772# Initialisation Vector (from IPSec ESP Payload) 1773# concatenated with 0x00000001. 16-byte aligned pointer. */ 1774# const u8 *aad, /* Additional Authentication Data (AAD)*/ 1775# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 1776############################################################# 1777SYM_FUNC_START(aesni_gcm_init_avx_gen2) 1778 FUNC_SAVE 1779 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX 1780 FUNC_RESTORE 1781 ret 1782SYM_FUNC_END(aesni_gcm_init_avx_gen2) 1783 1784############################################################################### 1785#void aesni_gcm_enc_update_avx_gen2( 1786# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1787# gcm_context_data *data, 1788# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ 1789# const u8 *in, /* Plaintext input */ 1790# u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/
###############################################################################
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      key_256_enc_update
        cmp     $16, %eax
        je      key_128_enc_update
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
        FUNC_RESTORE
        ret
key_128_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
        FUNC_RESTORE
        ret
key_256_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)

###############################################################################
#void   aesni_gcm_dec_update_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_dec_update
        cmp     $16, %eax
        je      key_128_dec_update
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
        FUNC_RESTORE
        ret
key_128_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
        FUNC_RESTORE
        ret
key_256_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)

###############################################################################
#void   aesni_gcm_finalize_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#                               Valid values are 16 (most likely), 12 or 8. */
###############################################################################
SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_finalize
        cmp     $16, %eax
        je      key_128_finalize
        # must be 192
        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
        FUNC_RESTORE
        ret
key_128_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
        FUNC_RESTORE
        ret
key_256_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)

###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1 )
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
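#
# Note: unlike GHASH_MUL_AVX above, the 128x128-bit product here is the plain
# four-vpclmulqdq schoolbook form (a1*b1, a0*b0, a1*b0, a0*b1) rather than
# Karatsuba, and the two-phase reduction is carried out with vpclmulqdq
# against the folded constant POLY2 (defined at the top of this file) instead
# of the shift-and-XOR sequence.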
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1 )
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
###############################################################################
.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5

        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
        vpxor           \T3, \GH, \GH


        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs

        vpxor           \T3, \T1, \T1
        vpxor           \T2, \GH, \GH

        #######################################################################
        #first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \GH, \T3, \T2
        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs

        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \GH, \T3, \T2
        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \GH, \T3, \GH
        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
        #######################################################################
        vpxor           \T1, \GH, \GH          # the result is in GH


.endm

.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6

        # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
        vmovdqa         \HK, \T5
        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^2<<1 mod poly
        vmovdqu         \T5, HashKey_2(arg2)                    # [HashKey_2] = HashKey^2<<1 mod poly

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^3<<1 mod poly
        vmovdqu         \T5, HashKey_3(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^4<<1 mod poly
        vmovdqu         \T5, HashKey_4(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^5<<1 mod poly
        vmovdqu         \T5, HashKey_5(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^6<<1 mod poly
        vmovdqu         \T5, HashKey_6(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^7<<1 mod poly
        vmovdqu         \T5, HashKey_7(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^8<<1 mod poly
        vmovdqu         \T5, HashKey_8(arg2)

.endm

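###############################################################################
# The precompute step above just fills a table of hash-key powers,
# HashKey^2 .. HashKey^8 (each already in the <<1 mod poly form), by repeated
# GHASH multiplication with the original HashKey.  Conceptually, in C
# (illustrative only; ghash_mul() is a hypothetical helper standing in for
# GHASH_MUL_AVX2, and table[i] stands in for the HashKey_i(arg2) slots):
#
#   be128 pow = hash_key;                    /* HashKey<<1 mod poly     */
#   for (int i = 2; i <= 8; i++) {
#           pow = ghash_mul(pow, hash_key);  /* HashKey^i<<1 mod poly   */
#           table[i] = pow;
#   }
###############################################################################
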
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as a pointer only, not modified

.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
        i = (8-\num_initial_blocks)
        setreg
        vmovdqu AadHash(arg2), reg_i

        # start AES for num_initial_blocks blocks
        vmovdqu CurCount(arg2), \CTR

        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, reg_i
        vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
        i = (i+1)
        setreg
.endr

        vmovdqa (arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpxor   \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = 1
        setreg
.rep \REP
        vmovdqa 16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenc \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = (j+1)
        setreg
.endr


        vmovdqa 16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenclast     \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vmovdqu (arg4, %r11), \T1
        vpxor   \T1, reg_i, reg_i
        vmovdqu reg_i, (arg3 , %r11)            # write back ciphertext for
                                                # num_initial_blocks blocks
        add     $16, %r11
.if \ENC_DEC == DEC
        vmovdqa \T1, reg_i
.endif
        vpshufb SHUF_MASK(%rip), reg_i, reg_i   # prepare ciphertext for GHASH computations
        i = (i+1)
        setreg
.endr


        i = (8-\num_initial_blocks)
        j = (9-\num_initial_blocks)
        setreg

.rep \num_initial_blocks
        vpxor   reg_i, reg_j, reg_j
        GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6     # apply GHASH on num_initial_blocks blocks
        i = (i+1)
        j = (j+1)
        setreg
.endr
        # XMM8 has the combined result here

        vmovdqa \XMM8, TMP1(%rsp)
        vmovdqa \XMM8, \T3

        cmp     $128, %r13
        jl      _initial_blocks_done\@          # no need for precomputed constants

###############################################################################
# HashKey_i_k holds XORed values of the low and high parts of HashKey_i
        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM1
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM2
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM3
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM4
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM5
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM6
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM7
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM8
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

        vmovdqa (arg1), \T_key
        vpxor   \T_key, \XMM1, \XMM1
        vpxor   \T_key, \XMM2, \XMM2
        vpxor   \T_key, \XMM3, \XMM3
        vpxor   \T_key, \XMM4, \XMM4
        vpxor   \T_key, \XMM5, \XMM5
        vpxor   \T_key, \XMM6, \XMM6
        vpxor   \T_key, \XMM7, \XMM7
        vpxor   \T_key, \XMM8, \XMM8

        i = 1
        setreg
.rep \REP                                       # do REP rounds
        vmovdqa 16*i(arg1), \T_key
        vaesenc \T_key, \XMM1, \XMM1
        vaesenc \T_key, \XMM2, \XMM2
        vaesenc \T_key, \XMM3, \XMM3
        vaesenc \T_key, \XMM4, \XMM4
        vaesenc \T_key, \XMM5, \XMM5
        vaesenc \T_key, \XMM6, \XMM6
        vaesenc \T_key, \XMM7, \XMM7
        vaesenc \T_key, \XMM8, \XMM8
        i = (i+1)
        setreg
.endr


        vmovdqa 16*i(arg1), \T_key
        vaesenclast     \T_key, \XMM1, \XMM1
        vaesenclast     \T_key, \XMM2, \XMM2
        vaesenclast     \T_key, \XMM3, \XMM3
        vaesenclast     \T_key, \XMM4, \XMM4
        vaesenclast     \T_key, \XMM5, \XMM5
        vaesenclast     \T_key, \XMM6, \XMM6
        vaesenclast     \T_key, \XMM7, \XMM7
        vaesenclast     \T_key, \XMM8, \XMM8

        vmovdqu (arg4, %r11), \T1
        vpxor   \T1, \XMM1, \XMM1
        vmovdqu \XMM1, (arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM1
        .endif

        vmovdqu 16*1(arg4, %r11), \T1
        vpxor   \T1, \XMM2, \XMM2
        vmovdqu \XMM2, 16*1(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM2
        .endif

        vmovdqu 16*2(arg4, %r11), \T1
        vpxor   \T1, \XMM3, \XMM3
        vmovdqu \XMM3, 16*2(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM3
        .endif

        vmovdqu 16*3(arg4, %r11), \T1
        vpxor   \T1, \XMM4, \XMM4
        vmovdqu \XMM4, 16*3(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM4
        .endif

        vmovdqu 16*4(arg4, %r11), \T1
        vpxor   \T1, \XMM5, \XMM5
        vmovdqu \XMM5, 16*4(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM5
        .endif

        vmovdqu 16*5(arg4, %r11), \T1
        vpxor   \T1, \XMM6, \XMM6
        vmovdqu \XMM6, 16*5(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM6
        .endif

        vmovdqu 16*6(arg4, %r11), \T1
        vpxor   \T1, \XMM7, \XMM7
        vmovdqu \XMM7, 16*6(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM7
        .endif

        vmovdqu 16*7(arg4, %r11), \T1
        vpxor   \T1, \XMM8, \XMM8
        vmovdqu \XMM8, 16*7(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM8
        .endif

        add     $128, %r11

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpxor   TMP1(%rsp), \XMM1, \XMM1        # combine GHASHed value with
                                                # the corresponding ciphertext
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

###############################################################################

_initial_blocks_done\@:


.endm


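###############################################################################
# The blocks handled by INITIAL_BLOCKS_AVX2 are whatever is left over once the
# bulk 8-block loop is accounted for.  Roughly, in C (illustrative only; the
# real computation lives in the GCM_ENC_DEC macro that invokes this one):
#
#   u64 full_blocks        = plaintext_len / 16;
#   u64 num_initial_blocks = full_blocks % 8;   /* 0..7 blocks handled here  */
#   u64 bulk_blocks        = full_blocks - num_initial_blocks; /* 8 at a time */
###############################################################################
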
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
# arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

        vmovdqa \XMM1, \T2
        vmovdqa \XMM2, TMP2(%rsp)
        vmovdqa \XMM3, TMP3(%rsp)
        vmovdqa \XMM4, TMP4(%rsp)
        vmovdqa \XMM5, TMP5(%rsp)
        vmovdqa \XMM6, TMP6(%rsp)
        vmovdqa \XMM7, TMP7(%rsp)
        vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
        vpaddd  ONE(%rip), \CTR, \XMM1          # INCR CNT
        vpaddd  ONE(%rip), \XMM1, \XMM2
        vpaddd  ONE(%rip), \XMM2, \XMM3
        vpaddd  ONE(%rip), \XMM3, \XMM4
        vpaddd  ONE(%rip), \XMM4, \XMM5
        vpaddd  ONE(%rip), \XMM5, \XMM6
        vpaddd  ONE(%rip), \XMM6, \XMM7
        vpaddd  ONE(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
.else
        vpaddd  ONEf(%rip), \CTR, \XMM1         # INCR CNT
        vpaddd  ONEf(%rip), \XMM1, \XMM2
        vpaddd  ONEf(%rip), \XMM2, \XMM3
        vpaddd  ONEf(%rip), \XMM3, \XMM4
        vpaddd  ONEf(%rip), \XMM4, \XMM5
        vpaddd  ONEf(%rip), \XMM5, \XMM6
        vpaddd  ONEf(%rip), \XMM6, \XMM7
        vpaddd  ONEf(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR
.endif


        #######################################################################

        vmovdqu (arg1), \T1
        vpxor   \T1, \XMM1, \XMM1
        vpxor   \T1, \XMM2, \XMM2
        vpxor   \T1, \XMM3, \XMM3
        vpxor   \T1, \XMM4, \XMM4
        vpxor   \T1, \XMM5, \XMM5
        vpxor   \T1, \XMM6, \XMM6
        vpxor   \T1, \XMM7, \XMM7
        vpxor   \T1, \XMM8, \XMM8

        #######################################################################





        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8


        #######################################################################

        vmovdqu HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4    # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7    # T7 = a0*b0
        vpclmulqdq      $0x01, \T5, \T2, \T6    # T6 = a1*b0
        vpclmulqdq      $0x10, \T5, \T2, \T5    # T5 = a0*b1
        vpxor           \T5, \T6, \T6

        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP2(%rsp), \T1
        vmovdqu HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa TMP3(%rsp), \T1
        vmovdqu HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP4(%rsp), \T1
        vmovdqu HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8


        vmovdqa TMP5(%rsp), \T1
        vmovdqu HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP6(%rsp), \T1
        vmovdqu HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP7(%rsp), \T1
        vmovdqu HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6


        #######################################################################

        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqa TMP8(%rsp), \T1
        vmovdqu HashKey(arg2), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T1


        vmovdqu 16*10(arg1), \T5

        i = 11
        setreg
.rep (\REP-9)
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqu 16*i(arg1), \T5
        i = i + 1
        setreg
.endr

        i = 0
        j = 1
        setreg
.rep 8
        vpxor   16*i(arg4, %r11), \T5, \T2
        .if \ENC_DEC == ENC
        vaesenclast     \T2, reg_j, reg_j
        .else
        vaesenclast     \T2, reg_j, \T3
        vmovdqu 16*i(arg4, %r11), reg_j
        vmovdqu \T3, 16*i(arg3, %r11)
        .endif
        i = (i+1)
        j = (j+1)
        setreg
.endr
        #######################################################################


        vpslldq $8, \T6, \T3                    # shift-L T3 2 DWs
        vpsrldq $8, \T6, \T6                    # shift-R T6 2 DWs
        vpxor   \T3, \T7, \T7
        vpxor   \T6, \T1, \T1                   # accumulate the results in T1:T7



        #######################################################################
        #first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2            # shift-L xmm2 2 DWs

        vpxor           \T2, \T7, \T7           # first phase of the reduction complete
        #######################################################################
        .if \ENC_DEC == ENC
        vmovdqu \XMM1, 16*0(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg3,%r11)          # Write to the Ciphertext buffer
        .endif

        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2            # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4            # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4           # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T1, \T1           # the result is in T1

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap


        vpxor   \T1, \XMM1, \XMM1



.endm


# GHASH the last 8 ciphertext blocks.
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method

        vmovdqu HashKey_8(arg2), \T5

        vpshufd         $0b01001110, \XMM1, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM1, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################

        vmovdqu HashKey_7(arg2), \T5
        vpshufd         $0b01001110, \XMM2, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM2, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_6(arg2), \T5
        vpshufd         $0b01001110, \XMM3, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM3, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_5(arg2), \T5
        vpshufd         $0b01001110, \XMM4, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM4, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_4(arg2), \T5
        vpshufd         $0b01001110, \XMM5, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM5, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_3(arg2), \T5
        vpshufd         $0b01001110, \XMM6, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM6, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_2(arg2), \T5
        vpshufd         $0b01001110, \XMM7, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM7, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey(arg2), \T5
        vpshufd         $0b01001110, \XMM8, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM8, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2




        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                                # accumulated carry-less multiplications

        #######################################################################
        #first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2            # shift-L xmm2 2 DWs

        vpxor           \T2, \T7, \T7           # first phase of the reduction complete
        #######################################################################


        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2            # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4            # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4           # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T6, \T6           # the result is in T6
.endm

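###############################################################################
# Each per-block step above is one Karatsuba carry-less multiply: three
# VPCLMULQDQ per block instead of four, with the middle term recovered as
# mid ^ hi ^ lo once all blocks are accumulated.  A rough C-intrinsics
# rendering of one accumulation step (illustrative only; compile with
# -mpclmul):
#
#   #include <wmmintrin.h>          /* _mm_clmulepi64_si128 */
#
#   static inline void karatsuba_step(__m128i x, __m128i h,
#                                     __m128i *hi, __m128i *lo, __m128i *mid)
#   {
#           __m128i xs = _mm_xor_si128(_mm_shuffle_epi32(x, 0x4e), x);
#           __m128i hs = _mm_xor_si128(_mm_shuffle_epi32(h, 0x4e), h);
#
#           *hi  = _mm_xor_si128(*hi,  _mm_clmulepi64_si128(x, h, 0x11));
#           *lo  = _mm_xor_si128(*lo,  _mm_clmulepi64_si128(x, h, 0x00));
#           *mid = _mm_xor_si128(*mid, _mm_clmulepi64_si128(xs, hs, 0x00));
#   }
###############################################################################
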

#############################################################
#void   aesni_gcm_init_avx_gen4
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#         u8     *iv, /* Pre-counter block j0: 4 byte salt
#                       (from Security Association) concatenated with 8 byte
#                       Initialisation Vector (from IPSec ESP Payload)
#                       concatenated with 0x00000001. 16-byte aligned pointer. */
#         u8     *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         const  u8 *aad, /* Additional Authentication Data (AAD)*/
#         u64    aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
SYM_FUNC_START(aesni_gcm_init_avx_gen4)
        FUNC_SAVE
        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_init_avx_gen4)

###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_enc_update4
        cmp     $16, %eax
        je      key_128_enc_update4
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
        FUNC_RESTORE
        ret
key_128_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
        FUNC_RESTORE
        ret
key_256_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)

###############################################################################
#void   aesni_gcm_dec_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_dec_update4
        cmp     $16, %eax
        je      key_128_dec_update4
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
        FUNC_RESTORE
        ret
key_128_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
        FUNC_RESTORE
        ret
key_256_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)

###############################################################################
#void   aesni_gcm_finalize_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#                               Valid values are 16 (most likely), 12 or 8. */
###############################################################################
SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_finalize4
        cmp     $16, %eax
        je      key_128_finalize4
        # must be 192
        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
        FUNC_RESTORE
        ret
key_128_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
        FUNC_RESTORE
        ret
key_256_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
        FUNC_RESTORE
        ret
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
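
###############################################################################
# The 9/11/13 passed to GCM_ENC_DEC / GCM_COMPLETE above is the number of full
# AES rounds for the key size; the final round is issued separately with
# vaesenclast.  An illustrative C rendering of the keysize dispatch performed
# in the functions above:
#
#   static int aes_full_rounds(unsigned int key_len_bytes)
#   {
#           switch (key_len_bytes) {
#           case 16: return 9;      /* AES-128: 10 rounds total */
#           case 32: return 13;     /* AES-256: 14 rounds total */
#           default: return 11;     /* AES-192: 12 rounds total */
#           }
#   }
###############################################################################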