1######################################################################## 2# Copyright (c) 2013, Intel Corporation 3# 4# This software is available to you under a choice of one of two 5# licenses. You may choose to be licensed under the terms of the GNU 6# General Public License (GPL) Version 2, available from the file 7# COPYING in the main directory of this source tree, or the 8# OpenIB.org BSD license below: 9# 10# Redistribution and use in source and binary forms, with or without 11# modification, are permitted provided that the following conditions are 12# met: 13# 14# * Redistributions of source code must retain the above copyright 15# notice, this list of conditions and the following disclaimer. 16# 17# * Redistributions in binary form must reproduce the above copyright 18# notice, this list of conditions and the following disclaimer in the 19# documentation and/or other materials provided with the 20# distribution. 21# 22# * Neither the name of the Intel Corporation nor the names of its 23# contributors may be used to endorse or promote products derived from 24# this software without specific prior written permission. 25# 26# 27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY 28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR 31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR 34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38######################################################################## 39## 40## Authors: 41## Erdinc Ozturk <erdinc.ozturk@intel.com> 42## Vinodh Gopal <vinodh.gopal@intel.com> 43## James Guilford <james.guilford@intel.com> 44## Tim Chen <tim.c.chen@linux.intel.com> 45## 46## References: 47## This code was derived and highly optimized from the code described in paper: 48## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation 49## on Intel Architecture Processors. August, 2010 50## The details of the implementation is explained in: 51## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode 52## on Intel Architecture Processors. October, 2012. 53## 54## Assumptions: 55## 56## 57## 58## iv: 59## 0 1 2 3 60## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 61## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 62## | Salt (From the SA) | 63## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 64## | Initialization Vector | 65## | (This is the sequence number from IPSec header) | 66## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 67## | 0x1 | 68## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 69## 70## 71## 72## AAD: 73## AAD padded to 128 bits with 0 74## for example, assume AAD is a u32 vector 75## 76## if AAD is 8 bytes: 77## AAD[3] = {A0, A1}# 78## padded AAD in xmm register = {A1 A0 0 0} 79## 80## 0 1 2 3 81## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 82## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 83## | SPI (A1) | 84## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 85## | 32-bit Sequence Number (A0) | 86## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 87## | 0x0 | 88## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 89## 90## AAD Format with 32-bit Sequence Number 91## 92## if AAD is 12 bytes: 93## AAD[3] = {A0, A1, A2}# 94## padded AAD in xmm register = {A2 A1 A0 0} 95## 96## 0 1 2 3 97## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 98## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 99## | SPI (A2) | 100## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 101## | 64-bit Extended Sequence Number {A1,A0} | 102## | | 103## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 104## | 0x0 | 105## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 106## 107## AAD Format with 64-bit Extended Sequence Number 108## 109## 110## aadLen: 111## from the definition of the spec, aadLen can only be 8 or 12 bytes. 112## The code additionally supports aadLen of length 16 bytes. 113## 114## TLen: 115## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 116## 117## poly = x^128 + x^127 + x^126 + x^121 + 1 118## throughout the code, one tab and two tab indentations are used. one tab is 119## for GHASH part, two tabs is for AES part. 120## 121 122#include <linux/linkage.h> 123 124# constants in mergeable sections, linker can reorder and merge 125.section .rodata.cst16.POLY, "aM", @progbits, 16 126.align 16 127POLY: .octa 0xC2000000000000000000000000000001 128 129.section .rodata.cst16.POLY2, "aM", @progbits, 16 130.align 16 131POLY2: .octa 0xC20000000000000000000001C2000000 132 133.section .rodata.cst16.TWOONE, "aM", @progbits, 16 134.align 16 135TWOONE: .octa 0x00000001000000000000000000000001 136 137.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 138.align 16 139SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F 140 141.section .rodata.cst16.ONE, "aM", @progbits, 16 142.align 16 143ONE: .octa 0x00000000000000000000000000000001 144 145.section .rodata.cst16.ONEf, "aM", @progbits, 16 146.align 16 147ONEf: .octa 0x01000000000000000000000000000000 148 149# order of these constants should not change. 150# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F 151.section .rodata, "a", @progbits 152.align 16 153SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 154ALL_F: .octa 0xffffffffffffffffffffffffffffffff 155 .octa 0x00000000000000000000000000000000 156 157.section .rodata 158.align 16 159.type aad_shift_arr, @object 160.size aad_shift_arr, 272 161aad_shift_arr: 162 .octa 0xffffffffffffffffffffffffffffffff 163 .octa 0xffffffffffffffffffffffffffffff0C 164 .octa 0xffffffffffffffffffffffffffff0D0C 165 .octa 0xffffffffffffffffffffffffff0E0D0C 166 .octa 0xffffffffffffffffffffffff0F0E0D0C 167 .octa 0xffffffffffffffffffffff0C0B0A0908 168 .octa 0xffffffffffffffffffff0D0C0B0A0908 169 .octa 0xffffffffffffffffff0E0D0C0B0A0908 170 .octa 0xffffffffffffffff0F0E0D0C0B0A0908 171 .octa 0xffffffffffffff0C0B0A090807060504 172 .octa 0xffffffffffff0D0C0B0A090807060504 173 .octa 0xffffffffff0E0D0C0B0A090807060504 174 .octa 0xffffffff0F0E0D0C0B0A090807060504 175 .octa 0xffffff0C0B0A09080706050403020100 176 .octa 0xffff0D0C0B0A09080706050403020100 177 .octa 0xff0E0D0C0B0A09080706050403020100 178 .octa 0x0F0E0D0C0B0A09080706050403020100 179 180 181.text 182 183 184#define AadHash 16*0 185#define AadLen 16*1 186#define InLen (16*1)+8 187#define PBlockEncKey 16*2 188#define OrigIV 16*3 189#define CurCount 16*4 190#define PBlockLen 16*5 191 192HashKey = 16*6 # store HashKey <<1 mod poly here 193HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here 194HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here 195HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here 196HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here 197HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here 198HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here 199HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here 200HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) 201HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) 202HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) 203HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) 204HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) 205HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) 206HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) 207HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) 208 209#define arg1 %rdi 210#define arg2 %rsi 211#define arg3 %rdx 212#define arg4 %rcx 213#define arg5 %r8 214#define arg6 %r9 215#define arg7 STACK_OFFSET+8*1(%r14) 216#define arg8 STACK_OFFSET+8*2(%r14) 217#define arg9 STACK_OFFSET+8*3(%r14) 218#define arg10 STACK_OFFSET+8*4(%r14) 219#define keysize 2*15*16(arg1) 220 221i = 0 222j = 0 223 224out_order = 0 225in_order = 1 226DEC = 0 227ENC = 1 228 229.macro define_reg r n 230reg_\r = %xmm\n 231.endm 232 233.macro setreg 234.altmacro 235define_reg i %i 236define_reg j %j 237.noaltmacro 238.endm 239 240# need to push 4 registers into stack to maintain 241STACK_OFFSET = 8*4 242 243TMP1 = 16*0 # Temporary storage for AAD 244TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) 245TMP3 = 16*2 # Temporary storage for AES State 3 246TMP4 = 16*3 # Temporary storage for AES State 4 247TMP5 = 16*4 # Temporary storage for AES State 5 248TMP6 = 16*5 # Temporary storage for AES State 6 249TMP7 = 16*6 # Temporary storage for AES State 7 250TMP8 = 16*7 # Temporary storage for AES State 8 251 252VARIABLE_OFFSET = 16*8 253 254################################ 255# Utility Macros 256################################ 257 258.macro FUNC_SAVE 259 #the number of pushes must equal STACK_OFFSET 260 push %r12 261 push %r13 262 push %r14 263 push %r15 264 265 mov %rsp, %r14 266 267 268 269 sub $VARIABLE_OFFSET, %rsp 270 and $~63, %rsp # align rsp to 64 bytes 271.endm 272 273.macro FUNC_RESTORE 274 mov %r14, %rsp 275 276 pop %r15 277 pop %r14 278 pop %r13 279 pop %r12 280.endm 281 282# Encryption of a single block 283.macro ENCRYPT_SINGLE_BLOCK REP XMM0 284 vpxor (arg1), \XMM0, \XMM0 285 i = 1 286 setreg 287.rep \REP 288 vaesenc 16*i(arg1), \XMM0, \XMM0 289 i = (i+1) 290 setreg 291.endr 292 vaesenclast 16*i(arg1), \XMM0, \XMM0 293.endm 294 295# combined for GCM encrypt and decrypt functions 296# clobbering all xmm registers 297# clobbering r10, r11, r12, r13, r14, r15 298.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP 299 vmovdqu AadHash(arg2), %xmm8 300 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey 301 add arg5, InLen(arg2) 302 303 # initialize the data pointer offset as zero 304 xor %r11d, %r11d 305 306 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC 307 sub %r11, arg5 308 309 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext 310 and $-16, %r13 # r13 = r13 - (r13 mod 16) 311 312 mov %r13, %r12 313 shr $4, %r12 314 and $7, %r12 315 jz _initial_num_blocks_is_0\@ 316 317 cmp $7, %r12 318 je _initial_num_blocks_is_7\@ 319 cmp $6, %r12 320 je _initial_num_blocks_is_6\@ 321 cmp $5, %r12 322 je _initial_num_blocks_is_5\@ 323 cmp $4, %r12 324 je _initial_num_blocks_is_4\@ 325 cmp $3, %r12 326 je _initial_num_blocks_is_3\@ 327 cmp $2, %r12 328 je _initial_num_blocks_is_2\@ 329 330 jmp _initial_num_blocks_is_1\@ 331 332_initial_num_blocks_is_7\@: 333 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 334 sub $16*7, %r13 335 jmp _initial_blocks_encrypted\@ 336 337_initial_num_blocks_is_6\@: 338 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 339 sub $16*6, %r13 340 jmp _initial_blocks_encrypted\@ 341 342_initial_num_blocks_is_5\@: 343 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 344 sub $16*5, %r13 345 jmp _initial_blocks_encrypted\@ 346 347_initial_num_blocks_is_4\@: 348 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 349 sub $16*4, %r13 350 jmp _initial_blocks_encrypted\@ 351 352_initial_num_blocks_is_3\@: 353 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 354 sub $16*3, %r13 355 jmp _initial_blocks_encrypted\@ 356 357_initial_num_blocks_is_2\@: 358 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 359 sub $16*2, %r13 360 jmp _initial_blocks_encrypted\@ 361 362_initial_num_blocks_is_1\@: 363 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 364 sub $16*1, %r13 365 jmp _initial_blocks_encrypted\@ 366 367_initial_num_blocks_is_0\@: 368 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 369 370 371_initial_blocks_encrypted\@: 372 cmp $0, %r13 373 je _zero_cipher_left\@ 374 375 sub $128, %r13 376 je _eight_cipher_left\@ 377 378 379 380 381 vmovd %xmm9, %r15d 382 and $255, %r15d 383 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 384 385 386_encrypt_by_8_new\@: 387 cmp $(255-8), %r15d 388 jg _encrypt_by_8\@ 389 390 391 392 add $8, %r15b 393 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC 394 add $128, %r11 395 sub $128, %r13 396 jne _encrypt_by_8_new\@ 397 398 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 399 jmp _eight_cipher_left\@ 400 401_encrypt_by_8\@: 402 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 403 add $8, %r15b 404 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC 405 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 406 add $128, %r11 407 sub $128, %r13 408 jne _encrypt_by_8_new\@ 409 410 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 411 412 413 414 415_eight_cipher_left\@: 416 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 417 418 419_zero_cipher_left\@: 420 vmovdqu %xmm14, AadHash(arg2) 421 vmovdqu %xmm9, CurCount(arg2) 422 423 # check for 0 length 424 mov arg5, %r13 425 and $15, %r13 # r13 = (arg5 mod 16) 426 427 je _multiple_of_16_bytes\@ 428 429 # handle the last <16 Byte block separately 430 431 mov %r13, PBlockLen(arg2) 432 433 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn 434 vmovdqu %xmm9, CurCount(arg2) 435 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 436 437 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) 438 vmovdqu %xmm9, PBlockEncKey(arg2) 439 440 cmp $16, arg5 441 jge _large_enough_update\@ 442 443 lea (arg4,%r11,1), %r10 444 mov %r13, %r12 445 446 READ_PARTIAL_BLOCK %r10 %r12 %xmm1 447 448 lea SHIFT_MASK+16(%rip), %r12 449 sub %r13, %r12 # adjust the shuffle mask pointer to be 450 # able to shift 16-r13 bytes (r13 is the 451 # number of bytes in plaintext mod 16) 452 453 jmp _final_ghash_mul\@ 454 455_large_enough_update\@: 456 sub $16, %r11 457 add %r13, %r11 458 459 # receive the last <16 Byte block 460 vmovdqu (arg4, %r11, 1), %xmm1 461 462 sub %r13, %r11 463 add $16, %r11 464 465 lea SHIFT_MASK+16(%rip), %r12 466 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 467 # (r13 is the number of bytes in plaintext mod 16) 468 sub %r13, %r12 469 # get the appropriate shuffle mask 470 vmovdqu (%r12), %xmm2 471 # shift right 16-r13 bytes 472 vpshufb %xmm2, %xmm1, %xmm1 473 474_final_ghash_mul\@: 475 .if \ENC_DEC == DEC 476 vmovdqa %xmm1, %xmm2 477 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 478 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 479 # mask out top 16-r13 bytes of xmm9 480 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 481 vpand %xmm1, %xmm2, %xmm2 482 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 483 vpxor %xmm2, %xmm14, %xmm14 484 485 vmovdqu %xmm14, AadHash(arg2) 486 .else 487 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 488 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 489 # mask out top 16-r13 bytes of xmm9 490 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 491 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 492 vpxor %xmm9, %xmm14, %xmm14 493 494 vmovdqu %xmm14, AadHash(arg2) 495 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext 496 .endif 497 498 499 ############################# 500 # output r13 Bytes 501 vmovq %xmm9, %rax 502 cmp $8, %r13 503 jle _less_than_8_bytes_left\@ 504 505 mov %rax, (arg3 , %r11) 506 add $8, %r11 507 vpsrldq $8, %xmm9, %xmm9 508 vmovq %xmm9, %rax 509 sub $8, %r13 510 511_less_than_8_bytes_left\@: 512 movb %al, (arg3 , %r11) 513 add $1, %r11 514 shr $8, %rax 515 sub $1, %r13 516 jne _less_than_8_bytes_left\@ 517 ############################# 518 519_multiple_of_16_bytes\@: 520.endm 521 522 523# GCM_COMPLETE Finishes update of tag of last partial block 524# Output: Authorization Tag (AUTH_TAG) 525# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 526.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN 527 vmovdqu AadHash(arg2), %xmm14 528 vmovdqu HashKey(arg2), %xmm13 529 530 mov PBlockLen(arg2), %r12 531 cmp $0, %r12 532 je _partial_done\@ 533 534 #GHASH computation for the last <16 Byte block 535 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 536 537_partial_done\@: 538 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) 539 shl $3, %r12 # convert into number of bits 540 vmovd %r12d, %xmm15 # len(A) in xmm15 541 542 mov InLen(arg2), %r12 543 shl $3, %r12 # len(C) in bits (*128) 544 vmovq %r12, %xmm1 545 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 546 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) 547 548 vpxor %xmm15, %xmm14, %xmm14 549 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation 550 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap 551 552 vmovdqu OrigIV(arg2), %xmm9 553 554 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) 555 556 vpxor %xmm14, %xmm9, %xmm9 557 558 559 560_return_T\@: 561 mov \AUTH_TAG, %r10 # r10 = authTag 562 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len 563 564 cmp $16, %r11 565 je _T_16\@ 566 567 cmp $8, %r11 568 jl _T_4\@ 569 570_T_8\@: 571 vmovq %xmm9, %rax 572 mov %rax, (%r10) 573 add $8, %r10 574 sub $8, %r11 575 vpsrldq $8, %xmm9, %xmm9 576 cmp $0, %r11 577 je _return_T_done\@ 578_T_4\@: 579 vmovd %xmm9, %eax 580 mov %eax, (%r10) 581 add $4, %r10 582 sub $4, %r11 583 vpsrldq $4, %xmm9, %xmm9 584 cmp $0, %r11 585 je _return_T_done\@ 586_T_123\@: 587 vmovd %xmm9, %eax 588 cmp $2, %r11 589 jl _T_1\@ 590 mov %ax, (%r10) 591 cmp $2, %r11 592 je _return_T_done\@ 593 add $2, %r10 594 sar $16, %eax 595_T_1\@: 596 mov %al, (%r10) 597 jmp _return_T_done\@ 598 599_T_16\@: 600 vmovdqu %xmm9, (%r10) 601 602_return_T_done\@: 603.endm 604 605.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 606 607 mov \AAD, %r10 # r10 = AAD 608 mov \AADLEN, %r12 # r12 = aadLen 609 610 611 mov %r12, %r11 612 613 vpxor \T8, \T8, \T8 614 vpxor \T7, \T7, \T7 615 cmp $16, %r11 616 jl _get_AAD_rest8\@ 617_get_AAD_blocks\@: 618 vmovdqu (%r10), \T7 619 vpshufb SHUF_MASK(%rip), \T7, \T7 620 vpxor \T7, \T8, \T8 621 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 622 add $16, %r10 623 sub $16, %r12 624 sub $16, %r11 625 cmp $16, %r11 626 jge _get_AAD_blocks\@ 627 vmovdqu \T8, \T7 628 cmp $0, %r11 629 je _get_AAD_done\@ 630 631 vpxor \T7, \T7, \T7 632 633 /* read the last <16B of AAD. since we have at least 4B of 634 data right after the AAD (the ICV, and maybe some CT), we can 635 read 4B/8B blocks safely, and then get rid of the extra stuff */ 636_get_AAD_rest8\@: 637 cmp $4, %r11 638 jle _get_AAD_rest4\@ 639 movq (%r10), \T1 640 add $8, %r10 641 sub $8, %r11 642 vpslldq $8, \T1, \T1 643 vpsrldq $8, \T7, \T7 644 vpxor \T1, \T7, \T7 645 jmp _get_AAD_rest8\@ 646_get_AAD_rest4\@: 647 cmp $0, %r11 648 jle _get_AAD_rest0\@ 649 mov (%r10), %eax 650 movq %rax, \T1 651 add $4, %r10 652 sub $4, %r11 653 vpslldq $12, \T1, \T1 654 vpsrldq $4, \T7, \T7 655 vpxor \T1, \T7, \T7 656_get_AAD_rest0\@: 657 /* finalize: shift out the extra bytes we read, and align 658 left. since pslldq can only shift by an immediate, we use 659 vpshufb and an array of shuffle masks */ 660 movq %r12, %r11 661 salq $4, %r11 662 vmovdqu aad_shift_arr(%r11), \T1 663 vpshufb \T1, \T7, \T7 664_get_AAD_rest_final\@: 665 vpshufb SHUF_MASK(%rip), \T7, \T7 666 vpxor \T8, \T7, \T7 667 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 668 669_get_AAD_done\@: 670 vmovdqu \T7, AadHash(arg2) 671.endm 672 673.macro INIT GHASH_MUL PRECOMPUTE 674 mov arg6, %r11 675 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length 676 xor %r11d, %r11d 677 mov %r11, InLen(arg2) # ctx_data.in_length = 0 678 679 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 680 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 681 mov arg3, %rax 682 movdqu (%rax), %xmm0 683 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv 684 685 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 686 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv 687 688 vmovdqu (arg4), %xmm6 # xmm6 = HashKey 689 690 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 691 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey 692 vmovdqa %xmm6, %xmm2 693 vpsllq $1, %xmm6, %xmm6 694 vpsrlq $63, %xmm2, %xmm2 695 vmovdqa %xmm2, %xmm1 696 vpslldq $8, %xmm2, %xmm2 697 vpsrldq $8, %xmm1, %xmm1 698 vpor %xmm2, %xmm6, %xmm6 699 #reduction 700 vpshufd $0b00100100, %xmm1, %xmm2 701 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 702 vpand POLY(%rip), %xmm2, %xmm2 703 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly 704 ####################################################################### 705 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly 706 707 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 708 709 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 710.endm 711 712 713# Reads DLEN bytes starting at DPTR and stores in XMMDst 714# where 0 < DLEN < 16 715# Clobbers %rax, DLEN 716.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst 717 vpxor \XMMDst, \XMMDst, \XMMDst 718 719 cmp $8, \DLEN 720 jl _read_lt8_\@ 721 mov (\DPTR), %rax 722 vpinsrq $0, %rax, \XMMDst, \XMMDst 723 sub $8, \DLEN 724 jz _done_read_partial_block_\@ 725 xor %eax, %eax 726_read_next_byte_\@: 727 shl $8, %rax 728 mov 7(\DPTR, \DLEN, 1), %al 729 dec \DLEN 730 jnz _read_next_byte_\@ 731 vpinsrq $1, %rax, \XMMDst, \XMMDst 732 jmp _done_read_partial_block_\@ 733_read_lt8_\@: 734 xor %eax, %eax 735_read_next_byte_lt8_\@: 736 shl $8, %rax 737 mov -1(\DPTR, \DLEN, 1), %al 738 dec \DLEN 739 jnz _read_next_byte_lt8_\@ 740 vpinsrq $0, %rax, \XMMDst, \XMMDst 741_done_read_partial_block_\@: 742.endm 743 744# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks 745# between update calls. 746# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK 747# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context 748# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 749.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ 750 AAD_HASH ENC_DEC 751 mov PBlockLen(arg2), %r13 752 cmp $0, %r13 753 je _partial_block_done_\@ # Leave Macro if no partial blocks 754 # Read in input data without over reading 755 cmp $16, \PLAIN_CYPH_LEN 756 jl _fewer_than_16_bytes_\@ 757 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm 758 jmp _data_read_\@ 759 760_fewer_than_16_bytes_\@: 761 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 762 mov \PLAIN_CYPH_LEN, %r12 763 READ_PARTIAL_BLOCK %r10 %r12 %xmm1 764 765 mov PBlockLen(arg2), %r13 766 767_data_read_\@: # Finished reading in data 768 769 vmovdqu PBlockEncKey(arg2), %xmm9 770 vmovdqu HashKey(arg2), %xmm13 771 772 lea SHIFT_MASK(%rip), %r12 773 774 # adjust the shuffle mask pointer to be able to shift r13 bytes 775 # r16-r13 is the number of bytes in plaintext mod 16) 776 add %r13, %r12 777 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask 778 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes 779 780.if \ENC_DEC == DEC 781 vmovdqa %xmm1, %xmm3 782 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) 783 784 mov \PLAIN_CYPH_LEN, %r10 785 add %r13, %r10 786 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 787 sub $16, %r10 788 # Determine if if partial block is not being filled and 789 # shift mask accordingly 790 jge _no_extra_mask_1_\@ 791 sub %r10, %r12 792_no_extra_mask_1_\@: 793 794 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 795 # get the appropriate mask to mask out bottom r13 bytes of xmm9 796 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 797 798 vpand %xmm1, %xmm3, %xmm3 799 vmovdqa SHUF_MASK(%rip), %xmm10 800 vpshufb %xmm10, %xmm3, %xmm3 801 vpshufb %xmm2, %xmm3, %xmm3 802 vpxor %xmm3, \AAD_HASH, \AAD_HASH 803 804 cmp $0, %r10 805 jl _partial_incomplete_1_\@ 806 807 # GHASH computation for the last <16 Byte block 808 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 809 xor %eax,%eax 810 811 mov %rax, PBlockLen(arg2) 812 jmp _dec_done_\@ 813_partial_incomplete_1_\@: 814 add \PLAIN_CYPH_LEN, PBlockLen(arg2) 815_dec_done_\@: 816 vmovdqu \AAD_HASH, AadHash(arg2) 817.else 818 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 819 820 mov \PLAIN_CYPH_LEN, %r10 821 add %r13, %r10 822 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 823 sub $16, %r10 824 # Determine if if partial block is not being filled and 825 # shift mask accordingly 826 jge _no_extra_mask_2_\@ 827 sub %r10, %r12 828_no_extra_mask_2_\@: 829 830 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 831 # get the appropriate mask to mask out bottom r13 bytes of xmm9 832 vpand %xmm1, %xmm9, %xmm9 833 834 vmovdqa SHUF_MASK(%rip), %xmm1 835 vpshufb %xmm1, %xmm9, %xmm9 836 vpshufb %xmm2, %xmm9, %xmm9 837 vpxor %xmm9, \AAD_HASH, \AAD_HASH 838 839 cmp $0, %r10 840 jl _partial_incomplete_2_\@ 841 842 # GHASH computation for the last <16 Byte block 843 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 844 xor %eax,%eax 845 846 mov %rax, PBlockLen(arg2) 847 jmp _encode_done_\@ 848_partial_incomplete_2_\@: 849 add \PLAIN_CYPH_LEN, PBlockLen(arg2) 850_encode_done_\@: 851 vmovdqu \AAD_HASH, AadHash(arg2) 852 853 vmovdqa SHUF_MASK(%rip), %xmm10 854 # shuffle xmm9 back to output as ciphertext 855 vpshufb %xmm10, %xmm9, %xmm9 856 vpshufb %xmm2, %xmm9, %xmm9 857.endif 858 # output encrypted Bytes 859 cmp $0, %r10 860 jl _partial_fill_\@ 861 mov %r13, %r12 862 mov $16, %r13 863 # Set r13 to be the number of bytes to write out 864 sub %r12, %r13 865 jmp _count_set_\@ 866_partial_fill_\@: 867 mov \PLAIN_CYPH_LEN, %r13 868_count_set_\@: 869 vmovdqa %xmm9, %xmm0 870 vmovq %xmm0, %rax 871 cmp $8, %r13 872 jle _less_than_8_bytes_left_\@ 873 874 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 875 add $8, \DATA_OFFSET 876 psrldq $8, %xmm0 877 vmovq %xmm0, %rax 878 sub $8, %r13 879_less_than_8_bytes_left_\@: 880 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 881 add $1, \DATA_OFFSET 882 shr $8, %rax 883 sub $1, %r13 884 jne _less_than_8_bytes_left_\@ 885_partial_block_done_\@: 886.endm # PARTIAL_BLOCK 887 888############################################################################### 889# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 890# Input: A and B (128-bits each, bit-reflected) 891# Output: C = A*B*x mod poly, (i.e. >>1 ) 892# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 893# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 894############################################################################### 895.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 896 897 vpshufd $0b01001110, \GH, \T2 898 vpshufd $0b01001110, \HK, \T3 899 vpxor \GH , \T2, \T2 # T2 = (a1+a0) 900 vpxor \HK , \T3, \T3 # T3 = (b1+b0) 901 902 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 903 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 904 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) 905 vpxor \GH, \T2,\T2 906 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 907 908 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs 909 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs 910 vpxor \T3, \GH, \GH 911 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK 912 913 #first phase of the reduction 914 vpslld $31, \GH, \T2 # packed right shifting << 31 915 vpslld $30, \GH, \T3 # packed right shifting shift << 30 916 vpslld $25, \GH, \T4 # packed right shifting shift << 25 917 918 vpxor \T3, \T2, \T2 # xor the shifted versions 919 vpxor \T4, \T2, \T2 920 921 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW 922 923 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 924 vpxor \T2, \GH, \GH # first phase of the reduction complete 925 926 #second phase of the reduction 927 928 vpsrld $1,\GH, \T2 # packed left shifting >> 1 929 vpsrld $2,\GH, \T3 # packed left shifting >> 2 930 vpsrld $7,\GH, \T4 # packed left shifting >> 7 931 vpxor \T3, \T2, \T2 # xor the shifted versions 932 vpxor \T4, \T2, \T2 933 934 vpxor \T5, \T2, \T2 935 vpxor \T2, \GH, \GH 936 vpxor \T1, \GH, \GH # the result is in GH 937 938 939.endm 940 941.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 942 943 # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 944 vmovdqa \HK, \T5 945 946 vpshufd $0b01001110, \T5, \T1 947 vpxor \T5, \T1, \T1 948 vmovdqu \T1, HashKey_k(arg2) 949 950 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly 951 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly 952 vpshufd $0b01001110, \T5, \T1 953 vpxor \T5, \T1, \T1 954 vmovdqu \T1, HashKey_2_k(arg2) 955 956 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly 957 vmovdqu \T5, HashKey_3(arg2) 958 vpshufd $0b01001110, \T5, \T1 959 vpxor \T5, \T1, \T1 960 vmovdqu \T1, HashKey_3_k(arg2) 961 962 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly 963 vmovdqu \T5, HashKey_4(arg2) 964 vpshufd $0b01001110, \T5, \T1 965 vpxor \T5, \T1, \T1 966 vmovdqu \T1, HashKey_4_k(arg2) 967 968 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly 969 vmovdqu \T5, HashKey_5(arg2) 970 vpshufd $0b01001110, \T5, \T1 971 vpxor \T5, \T1, \T1 972 vmovdqu \T1, HashKey_5_k(arg2) 973 974 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly 975 vmovdqu \T5, HashKey_6(arg2) 976 vpshufd $0b01001110, \T5, \T1 977 vpxor \T5, \T1, \T1 978 vmovdqu \T1, HashKey_6_k(arg2) 979 980 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly 981 vmovdqu \T5, HashKey_7(arg2) 982 vpshufd $0b01001110, \T5, \T1 983 vpxor \T5, \T1, \T1 984 vmovdqu \T1, HashKey_7_k(arg2) 985 986 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly 987 vmovdqu \T5, HashKey_8(arg2) 988 vpshufd $0b01001110, \T5, \T1 989 vpxor \T5, \T1, \T1 990 vmovdqu \T1, HashKey_8_k(arg2) 991 992.endm 993 994## if a = number of total plaintext bytes 995## b = floor(a/16) 996## num_initial_blocks = b mod 4# 997## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext 998## r10, r11, r12, rax are clobbered 999## arg1, arg3, arg4, r14 are used as a pointer only, not modified 1000 1001.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC 1002 i = (8-\num_initial_blocks) 1003 setreg 1004 vmovdqu AadHash(arg2), reg_i 1005 1006 # start AES for num_initial_blocks blocks 1007 vmovdqu CurCount(arg2), \CTR 1008 1009 i = (9-\num_initial_blocks) 1010 setreg 1011.rep \num_initial_blocks 1012 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1013 vmovdqa \CTR, reg_i 1014 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap 1015 i = (i+1) 1016 setreg 1017.endr 1018 1019 vmovdqa (arg1), \T_key 1020 i = (9-\num_initial_blocks) 1021 setreg 1022.rep \num_initial_blocks 1023 vpxor \T_key, reg_i, reg_i 1024 i = (i+1) 1025 setreg 1026.endr 1027 1028 j = 1 1029 setreg 1030.rep \REP 1031 vmovdqa 16*j(arg1), \T_key 1032 i = (9-\num_initial_blocks) 1033 setreg 1034.rep \num_initial_blocks 1035 vaesenc \T_key, reg_i, reg_i 1036 i = (i+1) 1037 setreg 1038.endr 1039 1040 j = (j+1) 1041 setreg 1042.endr 1043 1044 vmovdqa 16*j(arg1), \T_key 1045 i = (9-\num_initial_blocks) 1046 setreg 1047.rep \num_initial_blocks 1048 vaesenclast \T_key, reg_i, reg_i 1049 i = (i+1) 1050 setreg 1051.endr 1052 1053 i = (9-\num_initial_blocks) 1054 setreg 1055.rep \num_initial_blocks 1056 vmovdqu (arg4, %r11), \T1 1057 vpxor \T1, reg_i, reg_i 1058 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks 1059 add $16, %r11 1060.if \ENC_DEC == DEC 1061 vmovdqa \T1, reg_i 1062.endif 1063 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 1064 i = (i+1) 1065 setreg 1066.endr 1067 1068 1069 i = (8-\num_initial_blocks) 1070 j = (9-\num_initial_blocks) 1071 setreg 1072 1073.rep \num_initial_blocks 1074 vpxor reg_i, reg_j, reg_j 1075 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks 1076 i = (i+1) 1077 j = (j+1) 1078 setreg 1079.endr 1080 # XMM8 has the combined result here 1081 1082 vmovdqa \XMM8, TMP1(%rsp) 1083 vmovdqa \XMM8, \T3 1084 1085 cmp $128, %r13 1086 jl _initial_blocks_done\@ # no need for precomputed constants 1087 1088############################################################################### 1089# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 1090 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1091 vmovdqa \CTR, \XMM1 1092 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1093 1094 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1095 vmovdqa \CTR, \XMM2 1096 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1097 1098 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1099 vmovdqa \CTR, \XMM3 1100 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1101 1102 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1103 vmovdqa \CTR, \XMM4 1104 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1105 1106 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1107 vmovdqa \CTR, \XMM5 1108 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1109 1110 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1111 vmovdqa \CTR, \XMM6 1112 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1113 1114 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1115 vmovdqa \CTR, \XMM7 1116 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1117 1118 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1119 vmovdqa \CTR, \XMM8 1120 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1121 1122 vmovdqa (arg1), \T_key 1123 vpxor \T_key, \XMM1, \XMM1 1124 vpxor \T_key, \XMM2, \XMM2 1125 vpxor \T_key, \XMM3, \XMM3 1126 vpxor \T_key, \XMM4, \XMM4 1127 vpxor \T_key, \XMM5, \XMM5 1128 vpxor \T_key, \XMM6, \XMM6 1129 vpxor \T_key, \XMM7, \XMM7 1130 vpxor \T_key, \XMM8, \XMM8 1131 1132 i = 1 1133 setreg 1134.rep \REP # do REP rounds 1135 vmovdqa 16*i(arg1), \T_key 1136 vaesenc \T_key, \XMM1, \XMM1 1137 vaesenc \T_key, \XMM2, \XMM2 1138 vaesenc \T_key, \XMM3, \XMM3 1139 vaesenc \T_key, \XMM4, \XMM4 1140 vaesenc \T_key, \XMM5, \XMM5 1141 vaesenc \T_key, \XMM6, \XMM6 1142 vaesenc \T_key, \XMM7, \XMM7 1143 vaesenc \T_key, \XMM8, \XMM8 1144 i = (i+1) 1145 setreg 1146.endr 1147 1148 vmovdqa 16*i(arg1), \T_key 1149 vaesenclast \T_key, \XMM1, \XMM1 1150 vaesenclast \T_key, \XMM2, \XMM2 1151 vaesenclast \T_key, \XMM3, \XMM3 1152 vaesenclast \T_key, \XMM4, \XMM4 1153 vaesenclast \T_key, \XMM5, \XMM5 1154 vaesenclast \T_key, \XMM6, \XMM6 1155 vaesenclast \T_key, \XMM7, \XMM7 1156 vaesenclast \T_key, \XMM8, \XMM8 1157 1158 vmovdqu (arg4, %r11), \T1 1159 vpxor \T1, \XMM1, \XMM1 1160 vmovdqu \XMM1, (arg3 , %r11) 1161 .if \ENC_DEC == DEC 1162 vmovdqa \T1, \XMM1 1163 .endif 1164 1165 vmovdqu 16*1(arg4, %r11), \T1 1166 vpxor \T1, \XMM2, \XMM2 1167 vmovdqu \XMM2, 16*1(arg3 , %r11) 1168 .if \ENC_DEC == DEC 1169 vmovdqa \T1, \XMM2 1170 .endif 1171 1172 vmovdqu 16*2(arg4, %r11), \T1 1173 vpxor \T1, \XMM3, \XMM3 1174 vmovdqu \XMM3, 16*2(arg3 , %r11) 1175 .if \ENC_DEC == DEC 1176 vmovdqa \T1, \XMM3 1177 .endif 1178 1179 vmovdqu 16*3(arg4, %r11), \T1 1180 vpxor \T1, \XMM4, \XMM4 1181 vmovdqu \XMM4, 16*3(arg3 , %r11) 1182 .if \ENC_DEC == DEC 1183 vmovdqa \T1, \XMM4 1184 .endif 1185 1186 vmovdqu 16*4(arg4, %r11), \T1 1187 vpxor \T1, \XMM5, \XMM5 1188 vmovdqu \XMM5, 16*4(arg3 , %r11) 1189 .if \ENC_DEC == DEC 1190 vmovdqa \T1, \XMM5 1191 .endif 1192 1193 vmovdqu 16*5(arg4, %r11), \T1 1194 vpxor \T1, \XMM6, \XMM6 1195 vmovdqu \XMM6, 16*5(arg3 , %r11) 1196 .if \ENC_DEC == DEC 1197 vmovdqa \T1, \XMM6 1198 .endif 1199 1200 vmovdqu 16*6(arg4, %r11), \T1 1201 vpxor \T1, \XMM7, \XMM7 1202 vmovdqu \XMM7, 16*6(arg3 , %r11) 1203 .if \ENC_DEC == DEC 1204 vmovdqa \T1, \XMM7 1205 .endif 1206 1207 vmovdqu 16*7(arg4, %r11), \T1 1208 vpxor \T1, \XMM8, \XMM8 1209 vmovdqu \XMM8, 16*7(arg3 , %r11) 1210 .if \ENC_DEC == DEC 1211 vmovdqa \T1, \XMM8 1212 .endif 1213 1214 add $128, %r11 1215 1216 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1217 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext 1218 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1219 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1220 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1221 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1222 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1223 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1224 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1225 1226############################################################################### 1227 1228_initial_blocks_done\@: 1229 1230.endm 1231 1232# encrypt 8 blocks at a time 1233# ghash the 8 previously encrypted ciphertext blocks 1234# arg1, arg3, arg4 are used as pointers only, not modified 1235# r11 is the data offset value 1236.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC 1237 1238 vmovdqa \XMM1, \T2 1239 vmovdqa \XMM2, TMP2(%rsp) 1240 vmovdqa \XMM3, TMP3(%rsp) 1241 vmovdqa \XMM4, TMP4(%rsp) 1242 vmovdqa \XMM5, TMP5(%rsp) 1243 vmovdqa \XMM6, TMP6(%rsp) 1244 vmovdqa \XMM7, TMP7(%rsp) 1245 vmovdqa \XMM8, TMP8(%rsp) 1246 1247.if \loop_idx == in_order 1248 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT 1249 vpaddd ONE(%rip), \XMM1, \XMM2 1250 vpaddd ONE(%rip), \XMM2, \XMM3 1251 vpaddd ONE(%rip), \XMM3, \XMM4 1252 vpaddd ONE(%rip), \XMM4, \XMM5 1253 vpaddd ONE(%rip), \XMM5, \XMM6 1254 vpaddd ONE(%rip), \XMM6, \XMM7 1255 vpaddd ONE(%rip), \XMM7, \XMM8 1256 vmovdqa \XMM8, \CTR 1257 1258 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1259 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1260 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1261 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1262 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1263 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1264 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1265 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1266.else 1267 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT 1268 vpaddd ONEf(%rip), \XMM1, \XMM2 1269 vpaddd ONEf(%rip), \XMM2, \XMM3 1270 vpaddd ONEf(%rip), \XMM3, \XMM4 1271 vpaddd ONEf(%rip), \XMM4, \XMM5 1272 vpaddd ONEf(%rip), \XMM5, \XMM6 1273 vpaddd ONEf(%rip), \XMM6, \XMM7 1274 vpaddd ONEf(%rip), \XMM7, \XMM8 1275 vmovdqa \XMM8, \CTR 1276.endif 1277 1278 1279 ####################################################################### 1280 1281 vmovdqu (arg1), \T1 1282 vpxor \T1, \XMM1, \XMM1 1283 vpxor \T1, \XMM2, \XMM2 1284 vpxor \T1, \XMM3, \XMM3 1285 vpxor \T1, \XMM4, \XMM4 1286 vpxor \T1, \XMM5, \XMM5 1287 vpxor \T1, \XMM6, \XMM6 1288 vpxor \T1, \XMM7, \XMM7 1289 vpxor \T1, \XMM8, \XMM8 1290 1291 ####################################################################### 1292 1293 1294 1295 1296 1297 vmovdqu 16*1(arg1), \T1 1298 vaesenc \T1, \XMM1, \XMM1 1299 vaesenc \T1, \XMM2, \XMM2 1300 vaesenc \T1, \XMM3, \XMM3 1301 vaesenc \T1, \XMM4, \XMM4 1302 vaesenc \T1, \XMM5, \XMM5 1303 vaesenc \T1, \XMM6, \XMM6 1304 vaesenc \T1, \XMM7, \XMM7 1305 vaesenc \T1, \XMM8, \XMM8 1306 1307 vmovdqu 16*2(arg1), \T1 1308 vaesenc \T1, \XMM1, \XMM1 1309 vaesenc \T1, \XMM2, \XMM2 1310 vaesenc \T1, \XMM3, \XMM3 1311 vaesenc \T1, \XMM4, \XMM4 1312 vaesenc \T1, \XMM5, \XMM5 1313 vaesenc \T1, \XMM6, \XMM6 1314 vaesenc \T1, \XMM7, \XMM7 1315 vaesenc \T1, \XMM8, \XMM8 1316 1317 1318 ####################################################################### 1319 1320 vmovdqu HashKey_8(arg2), \T5 1321 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 1322 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 1323 1324 vpshufd $0b01001110, \T2, \T6 1325 vpxor \T2, \T6, \T6 1326 1327 vmovdqu HashKey_8_k(arg2), \T5 1328 vpclmulqdq $0x00, \T5, \T6, \T6 1329 1330 vmovdqu 16*3(arg1), \T1 1331 vaesenc \T1, \XMM1, \XMM1 1332 vaesenc \T1, \XMM2, \XMM2 1333 vaesenc \T1, \XMM3, \XMM3 1334 vaesenc \T1, \XMM4, \XMM4 1335 vaesenc \T1, \XMM5, \XMM5 1336 vaesenc \T1, \XMM6, \XMM6 1337 vaesenc \T1, \XMM7, \XMM7 1338 vaesenc \T1, \XMM8, \XMM8 1339 1340 vmovdqa TMP2(%rsp), \T1 1341 vmovdqu HashKey_7(arg2), \T5 1342 vpclmulqdq $0x11, \T5, \T1, \T3 1343 vpxor \T3, \T4, \T4 1344 vpclmulqdq $0x00, \T5, \T1, \T3 1345 vpxor \T3, \T7, \T7 1346 1347 vpshufd $0b01001110, \T1, \T3 1348 vpxor \T1, \T3, \T3 1349 vmovdqu HashKey_7_k(arg2), \T5 1350 vpclmulqdq $0x10, \T5, \T3, \T3 1351 vpxor \T3, \T6, \T6 1352 1353 vmovdqu 16*4(arg1), \T1 1354 vaesenc \T1, \XMM1, \XMM1 1355 vaesenc \T1, \XMM2, \XMM2 1356 vaesenc \T1, \XMM3, \XMM3 1357 vaesenc \T1, \XMM4, \XMM4 1358 vaesenc \T1, \XMM5, \XMM5 1359 vaesenc \T1, \XMM6, \XMM6 1360 vaesenc \T1, \XMM7, \XMM7 1361 vaesenc \T1, \XMM8, \XMM8 1362 1363 ####################################################################### 1364 1365 vmovdqa TMP3(%rsp), \T1 1366 vmovdqu HashKey_6(arg2), \T5 1367 vpclmulqdq $0x11, \T5, \T1, \T3 1368 vpxor \T3, \T4, \T4 1369 vpclmulqdq $0x00, \T5, \T1, \T3 1370 vpxor \T3, \T7, \T7 1371 1372 vpshufd $0b01001110, \T1, \T3 1373 vpxor \T1, \T3, \T3 1374 vmovdqu HashKey_6_k(arg2), \T5 1375 vpclmulqdq $0x10, \T5, \T3, \T3 1376 vpxor \T3, \T6, \T6 1377 1378 vmovdqu 16*5(arg1), \T1 1379 vaesenc \T1, \XMM1, \XMM1 1380 vaesenc \T1, \XMM2, \XMM2 1381 vaesenc \T1, \XMM3, \XMM3 1382 vaesenc \T1, \XMM4, \XMM4 1383 vaesenc \T1, \XMM5, \XMM5 1384 vaesenc \T1, \XMM6, \XMM6 1385 vaesenc \T1, \XMM7, \XMM7 1386 vaesenc \T1, \XMM8, \XMM8 1387 1388 vmovdqa TMP4(%rsp), \T1 1389 vmovdqu HashKey_5(arg2), \T5 1390 vpclmulqdq $0x11, \T5, \T1, \T3 1391 vpxor \T3, \T4, \T4 1392 vpclmulqdq $0x00, \T5, \T1, \T3 1393 vpxor \T3, \T7, \T7 1394 1395 vpshufd $0b01001110, \T1, \T3 1396 vpxor \T1, \T3, \T3 1397 vmovdqu HashKey_5_k(arg2), \T5 1398 vpclmulqdq $0x10, \T5, \T3, \T3 1399 vpxor \T3, \T6, \T6 1400 1401 vmovdqu 16*6(arg1), \T1 1402 vaesenc \T1, \XMM1, \XMM1 1403 vaesenc \T1, \XMM2, \XMM2 1404 vaesenc \T1, \XMM3, \XMM3 1405 vaesenc \T1, \XMM4, \XMM4 1406 vaesenc \T1, \XMM5, \XMM5 1407 vaesenc \T1, \XMM6, \XMM6 1408 vaesenc \T1, \XMM7, \XMM7 1409 vaesenc \T1, \XMM8, \XMM8 1410 1411 1412 vmovdqa TMP5(%rsp), \T1 1413 vmovdqu HashKey_4(arg2), \T5 1414 vpclmulqdq $0x11, \T5, \T1, \T3 1415 vpxor \T3, \T4, \T4 1416 vpclmulqdq $0x00, \T5, \T1, \T3 1417 vpxor \T3, \T7, \T7 1418 1419 vpshufd $0b01001110, \T1, \T3 1420 vpxor \T1, \T3, \T3 1421 vmovdqu HashKey_4_k(arg2), \T5 1422 vpclmulqdq $0x10, \T5, \T3, \T3 1423 vpxor \T3, \T6, \T6 1424 1425 vmovdqu 16*7(arg1), \T1 1426 vaesenc \T1, \XMM1, \XMM1 1427 vaesenc \T1, \XMM2, \XMM2 1428 vaesenc \T1, \XMM3, \XMM3 1429 vaesenc \T1, \XMM4, \XMM4 1430 vaesenc \T1, \XMM5, \XMM5 1431 vaesenc \T1, \XMM6, \XMM6 1432 vaesenc \T1, \XMM7, \XMM7 1433 vaesenc \T1, \XMM8, \XMM8 1434 1435 vmovdqa TMP6(%rsp), \T1 1436 vmovdqu HashKey_3(arg2), \T5 1437 vpclmulqdq $0x11, \T5, \T1, \T3 1438 vpxor \T3, \T4, \T4 1439 vpclmulqdq $0x00, \T5, \T1, \T3 1440 vpxor \T3, \T7, \T7 1441 1442 vpshufd $0b01001110, \T1, \T3 1443 vpxor \T1, \T3, \T3 1444 vmovdqu HashKey_3_k(arg2), \T5 1445 vpclmulqdq $0x10, \T5, \T3, \T3 1446 vpxor \T3, \T6, \T6 1447 1448 1449 vmovdqu 16*8(arg1), \T1 1450 vaesenc \T1, \XMM1, \XMM1 1451 vaesenc \T1, \XMM2, \XMM2 1452 vaesenc \T1, \XMM3, \XMM3 1453 vaesenc \T1, \XMM4, \XMM4 1454 vaesenc \T1, \XMM5, \XMM5 1455 vaesenc \T1, \XMM6, \XMM6 1456 vaesenc \T1, \XMM7, \XMM7 1457 vaesenc \T1, \XMM8, \XMM8 1458 1459 vmovdqa TMP7(%rsp), \T1 1460 vmovdqu HashKey_2(arg2), \T5 1461 vpclmulqdq $0x11, \T5, \T1, \T3 1462 vpxor \T3, \T4, \T4 1463 vpclmulqdq $0x00, \T5, \T1, \T3 1464 vpxor \T3, \T7, \T7 1465 1466 vpshufd $0b01001110, \T1, \T3 1467 vpxor \T1, \T3, \T3 1468 vmovdqu HashKey_2_k(arg2), \T5 1469 vpclmulqdq $0x10, \T5, \T3, \T3 1470 vpxor \T3, \T6, \T6 1471 1472 ####################################################################### 1473 1474 vmovdqu 16*9(arg1), \T5 1475 vaesenc \T5, \XMM1, \XMM1 1476 vaesenc \T5, \XMM2, \XMM2 1477 vaesenc \T5, \XMM3, \XMM3 1478 vaesenc \T5, \XMM4, \XMM4 1479 vaesenc \T5, \XMM5, \XMM5 1480 vaesenc \T5, \XMM6, \XMM6 1481 vaesenc \T5, \XMM7, \XMM7 1482 vaesenc \T5, \XMM8, \XMM8 1483 1484 vmovdqa TMP8(%rsp), \T1 1485 vmovdqu HashKey(arg2), \T5 1486 vpclmulqdq $0x11, \T5, \T1, \T3 1487 vpxor \T3, \T4, \T4 1488 vpclmulqdq $0x00, \T5, \T1, \T3 1489 vpxor \T3, \T7, \T7 1490 1491 vpshufd $0b01001110, \T1, \T3 1492 vpxor \T1, \T3, \T3 1493 vmovdqu HashKey_k(arg2), \T5 1494 vpclmulqdq $0x10, \T5, \T3, \T3 1495 vpxor \T3, \T6, \T6 1496 1497 vpxor \T4, \T6, \T6 1498 vpxor \T7, \T6, \T6 1499 1500 vmovdqu 16*10(arg1), \T5 1501 1502 i = 11 1503 setreg 1504.rep (\REP-9) 1505 1506 vaesenc \T5, \XMM1, \XMM1 1507 vaesenc \T5, \XMM2, \XMM2 1508 vaesenc \T5, \XMM3, \XMM3 1509 vaesenc \T5, \XMM4, \XMM4 1510 vaesenc \T5, \XMM5, \XMM5 1511 vaesenc \T5, \XMM6, \XMM6 1512 vaesenc \T5, \XMM7, \XMM7 1513 vaesenc \T5, \XMM8, \XMM8 1514 1515 vmovdqu 16*i(arg1), \T5 1516 i = i + 1 1517 setreg 1518.endr 1519 1520 i = 0 1521 j = 1 1522 setreg 1523.rep 8 1524 vpxor 16*i(arg4, %r11), \T5, \T2 1525 .if \ENC_DEC == ENC 1526 vaesenclast \T2, reg_j, reg_j 1527 .else 1528 vaesenclast \T2, reg_j, \T3 1529 vmovdqu 16*i(arg4, %r11), reg_j 1530 vmovdqu \T3, 16*i(arg3, %r11) 1531 .endif 1532 i = (i+1) 1533 j = (j+1) 1534 setreg 1535.endr 1536 ####################################################################### 1537 1538 1539 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs 1540 vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs 1541 vpxor \T3, \T7, \T7 1542 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 1543 1544 1545 1546 ####################################################################### 1547 #first phase of the reduction 1548 ####################################################################### 1549 vpslld $31, \T7, \T2 # packed right shifting << 31 1550 vpslld $30, \T7, \T3 # packed right shifting shift << 30 1551 vpslld $25, \T7, \T4 # packed right shifting shift << 25 1552 1553 vpxor \T3, \T2, \T2 # xor the shifted versions 1554 vpxor \T4, \T2, \T2 1555 1556 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 1557 1558 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 1559 vpxor \T2, \T7, \T7 # first phase of the reduction complete 1560 ####################################################################### 1561 .if \ENC_DEC == ENC 1562 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer 1563 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer 1564 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer 1565 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer 1566 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer 1567 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer 1568 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer 1569 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer 1570 .endif 1571 1572 ####################################################################### 1573 #second phase of the reduction 1574 vpsrld $1, \T7, \T2 # packed left shifting >> 1 1575 vpsrld $2, \T7, \T3 # packed left shifting >> 2 1576 vpsrld $7, \T7, \T4 # packed left shifting >> 7 1577 vpxor \T3, \T2, \T2 # xor the shifted versions 1578 vpxor \T4, \T2, \T2 1579 1580 vpxor \T1, \T2, \T2 1581 vpxor \T2, \T7, \T7 1582 vpxor \T7, \T6, \T6 # the result is in T6 1583 ####################################################################### 1584 1585 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1586 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1587 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1588 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1589 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1590 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1591 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1592 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1593 1594 1595 vpxor \T6, \XMM1, \XMM1 1596 1597 1598 1599.endm 1600 1601 1602# GHASH the last 4 ciphertext blocks. 1603.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 1604 1605 ## Karatsuba Method 1606 1607 1608 vpshufd $0b01001110, \XMM1, \T2 1609 vpxor \XMM1, \T2, \T2 1610 vmovdqu HashKey_8(arg2), \T5 1611 vpclmulqdq $0x11, \T5, \XMM1, \T6 1612 vpclmulqdq $0x00, \T5, \XMM1, \T7 1613 1614 vmovdqu HashKey_8_k(arg2), \T3 1615 vpclmulqdq $0x00, \T3, \T2, \XMM1 1616 1617 ###################### 1618 1619 vpshufd $0b01001110, \XMM2, \T2 1620 vpxor \XMM2, \T2, \T2 1621 vmovdqu HashKey_7(arg2), \T5 1622 vpclmulqdq $0x11, \T5, \XMM2, \T4 1623 vpxor \T4, \T6, \T6 1624 1625 vpclmulqdq $0x00, \T5, \XMM2, \T4 1626 vpxor \T4, \T7, \T7 1627 1628 vmovdqu HashKey_7_k(arg2), \T3 1629 vpclmulqdq $0x00, \T3, \T2, \T2 1630 vpxor \T2, \XMM1, \XMM1 1631 1632 ###################### 1633 1634 vpshufd $0b01001110, \XMM3, \T2 1635 vpxor \XMM3, \T2, \T2 1636 vmovdqu HashKey_6(arg2), \T5 1637 vpclmulqdq $0x11, \T5, \XMM3, \T4 1638 vpxor \T4, \T6, \T6 1639 1640 vpclmulqdq $0x00, \T5, \XMM3, \T4 1641 vpxor \T4, \T7, \T7 1642 1643 vmovdqu HashKey_6_k(arg2), \T3 1644 vpclmulqdq $0x00, \T3, \T2, \T2 1645 vpxor \T2, \XMM1, \XMM1 1646 1647 ###################### 1648 1649 vpshufd $0b01001110, \XMM4, \T2 1650 vpxor \XMM4, \T2, \T2 1651 vmovdqu HashKey_5(arg2), \T5 1652 vpclmulqdq $0x11, \T5, \XMM4, \T4 1653 vpxor \T4, \T6, \T6 1654 1655 vpclmulqdq $0x00, \T5, \XMM4, \T4 1656 vpxor \T4, \T7, \T7 1657 1658 vmovdqu HashKey_5_k(arg2), \T3 1659 vpclmulqdq $0x00, \T3, \T2, \T2 1660 vpxor \T2, \XMM1, \XMM1 1661 1662 ###################### 1663 1664 vpshufd $0b01001110, \XMM5, \T2 1665 vpxor \XMM5, \T2, \T2 1666 vmovdqu HashKey_4(arg2), \T5 1667 vpclmulqdq $0x11, \T5, \XMM5, \T4 1668 vpxor \T4, \T6, \T6 1669 1670 vpclmulqdq $0x00, \T5, \XMM5, \T4 1671 vpxor \T4, \T7, \T7 1672 1673 vmovdqu HashKey_4_k(arg2), \T3 1674 vpclmulqdq $0x00, \T3, \T2, \T2 1675 vpxor \T2, \XMM1, \XMM1 1676 1677 ###################### 1678 1679 vpshufd $0b01001110, \XMM6, \T2 1680 vpxor \XMM6, \T2, \T2 1681 vmovdqu HashKey_3(arg2), \T5 1682 vpclmulqdq $0x11, \T5, \XMM6, \T4 1683 vpxor \T4, \T6, \T6 1684 1685 vpclmulqdq $0x00, \T5, \XMM6, \T4 1686 vpxor \T4, \T7, \T7 1687 1688 vmovdqu HashKey_3_k(arg2), \T3 1689 vpclmulqdq $0x00, \T3, \T2, \T2 1690 vpxor \T2, \XMM1, \XMM1 1691 1692 ###################### 1693 1694 vpshufd $0b01001110, \XMM7, \T2 1695 vpxor \XMM7, \T2, \T2 1696 vmovdqu HashKey_2(arg2), \T5 1697 vpclmulqdq $0x11, \T5, \XMM7, \T4 1698 vpxor \T4, \T6, \T6 1699 1700 vpclmulqdq $0x00, \T5, \XMM7, \T4 1701 vpxor \T4, \T7, \T7 1702 1703 vmovdqu HashKey_2_k(arg2), \T3 1704 vpclmulqdq $0x00, \T3, \T2, \T2 1705 vpxor \T2, \XMM1, \XMM1 1706 1707 ###################### 1708 1709 vpshufd $0b01001110, \XMM8, \T2 1710 vpxor \XMM8, \T2, \T2 1711 vmovdqu HashKey(arg2), \T5 1712 vpclmulqdq $0x11, \T5, \XMM8, \T4 1713 vpxor \T4, \T6, \T6 1714 1715 vpclmulqdq $0x00, \T5, \XMM8, \T4 1716 vpxor \T4, \T7, \T7 1717 1718 vmovdqu HashKey_k(arg2), \T3 1719 vpclmulqdq $0x00, \T3, \T2, \T2 1720 1721 vpxor \T2, \XMM1, \XMM1 1722 vpxor \T6, \XMM1, \XMM1 1723 vpxor \T7, \XMM1, \T2 1724 1725 1726 1727 1728 vpslldq $8, \T2, \T4 1729 vpsrldq $8, \T2, \T2 1730 1731 vpxor \T4, \T7, \T7 1732 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of 1733 # the accumulated carry-less multiplications 1734 1735 ####################################################################### 1736 #first phase of the reduction 1737 vpslld $31, \T7, \T2 # packed right shifting << 31 1738 vpslld $30, \T7, \T3 # packed right shifting shift << 30 1739 vpslld $25, \T7, \T4 # packed right shifting shift << 25 1740 1741 vpxor \T3, \T2, \T2 # xor the shifted versions 1742 vpxor \T4, \T2, \T2 1743 1744 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 1745 1746 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 1747 vpxor \T2, \T7, \T7 # first phase of the reduction complete 1748 ####################################################################### 1749 1750 1751 #second phase of the reduction 1752 vpsrld $1, \T7, \T2 # packed left shifting >> 1 1753 vpsrld $2, \T7, \T3 # packed left shifting >> 2 1754 vpsrld $7, \T7, \T4 # packed left shifting >> 7 1755 vpxor \T3, \T2, \T2 # xor the shifted versions 1756 vpxor \T4, \T2, \T2 1757 1758 vpxor \T1, \T2, \T2 1759 vpxor \T2, \T7, \T7 1760 vpxor \T7, \T6, \T6 # the result is in T6 1761 1762.endm 1763 1764############################################################# 1765#void aesni_gcm_precomp_avx_gen2 1766# (gcm_data *my_ctx_data, 1767# gcm_context_data *data, 1768# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ 1769# u8 *iv, /* Pre-counter block j0: 4 byte salt 1770# (from Security Association) concatenated with 8 byte 1771# Initialisation Vector (from IPSec ESP Payload) 1772# concatenated with 0x00000001. 16-byte aligned pointer. */ 1773# const u8 *aad, /* Additional Authentication Data (AAD)*/ 1774# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 1775############################################################# 1776SYM_FUNC_START(aesni_gcm_init_avx_gen2) 1777 FUNC_SAVE 1778 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX 1779 FUNC_RESTORE 1780 ret 1781SYM_FUNC_END(aesni_gcm_init_avx_gen2) 1782 1783############################################################################### 1784#void aesni_gcm_enc_update_avx_gen2( 1785# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1786# gcm_context_data *data, 1787# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ 1788# const u8 *in, /* Plaintext input */ 1789# u64 plaintext_len) /* Length of data in Bytes for encryption. */ 1790############################################################################### 1791SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) 1792 FUNC_SAVE 1793 mov keysize, %eax 1794 cmp $32, %eax 1795 je key_256_enc_update 1796 cmp $16, %eax 1797 je key_128_enc_update 1798 # must be 192 1799 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 1800 FUNC_RESTORE 1801 ret 1802key_128_enc_update: 1803 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 1804 FUNC_RESTORE 1805 ret 1806key_256_enc_update: 1807 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 1808 FUNC_RESTORE 1809 ret 1810SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) 1811 1812############################################################################### 1813#void aesni_gcm_dec_update_avx_gen2( 1814# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1815# gcm_context_data *data, 1816# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ 1817# const u8 *in, /* Ciphertext input */ 1818# u64 plaintext_len) /* Length of data in Bytes for encryption. */ 1819############################################################################### 1820SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) 1821 FUNC_SAVE 1822 mov keysize,%eax 1823 cmp $32, %eax 1824 je key_256_dec_update 1825 cmp $16, %eax 1826 je key_128_dec_update 1827 # must be 192 1828 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 1829 FUNC_RESTORE 1830 ret 1831key_128_dec_update: 1832 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 1833 FUNC_RESTORE 1834 ret 1835key_256_dec_update: 1836 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 1837 FUNC_RESTORE 1838 ret 1839SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) 1840 1841############################################################################### 1842#void aesni_gcm_finalize_avx_gen2( 1843# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1844# gcm_context_data *data, 1845# u8 *auth_tag, /* Authenticated Tag output. */ 1846# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 1847# Valid values are 16 (most likely), 12 or 8. */ 1848############################################################################### 1849SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) 1850 FUNC_SAVE 1851 mov keysize,%eax 1852 cmp $32, %eax 1853 je key_256_finalize 1854 cmp $16, %eax 1855 je key_128_finalize 1856 # must be 192 1857 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 1858 FUNC_RESTORE 1859 ret 1860key_128_finalize: 1861 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 1862 FUNC_RESTORE 1863 ret 1864key_256_finalize: 1865 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 1866 FUNC_RESTORE 1867 ret 1868SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) 1869 1870############################################################################### 1871# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 1872# Input: A and B (128-bits each, bit-reflected) 1873# Output: C = A*B*x mod poly, (i.e. >>1 ) 1874# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 1875# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 1876############################################################################### 1877.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 1878 1879 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 1880 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 1881 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 1882 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 1883 vpxor \T3, \GH, \GH 1884 1885 1886 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs 1887 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs 1888 1889 vpxor \T3, \T1, \T1 1890 vpxor \T2, \GH, \GH 1891 1892 ####################################################################### 1893 #first phase of the reduction 1894 vmovdqa POLY2(%rip), \T3 1895 1896 vpclmulqdq $0x01, \GH, \T3, \T2 1897 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs 1898 1899 vpxor \T2, \GH, \GH # first phase of the reduction complete 1900 ####################################################################### 1901 #second phase of the reduction 1902 vpclmulqdq $0x00, \GH, \T3, \T2 1903 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 1904 1905 vpclmulqdq $0x10, \GH, \T3, \GH 1906 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) 1907 1908 vpxor \T2, \GH, \GH # second phase of the reduction complete 1909 ####################################################################### 1910 vpxor \T1, \GH, \GH # the result is in GH 1911 1912 1913.endm 1914 1915.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 1916 1917 # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 1918 vmovdqa \HK, \T5 1919 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly 1920 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly 1921 1922 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly 1923 vmovdqu \T5, HashKey_3(arg2) 1924 1925 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly 1926 vmovdqu \T5, HashKey_4(arg2) 1927 1928 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly 1929 vmovdqu \T5, HashKey_5(arg2) 1930 1931 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly 1932 vmovdqu \T5, HashKey_6(arg2) 1933 1934 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly 1935 vmovdqu \T5, HashKey_7(arg2) 1936 1937 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly 1938 vmovdqu \T5, HashKey_8(arg2) 1939 1940.endm 1941 1942## if a = number of total plaintext bytes 1943## b = floor(a/16) 1944## num_initial_blocks = b mod 4# 1945## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext 1946## r10, r11, r12, rax are clobbered 1947## arg1, arg3, arg4, r14 are used as a pointer only, not modified 1948 1949.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER 1950 i = (8-\num_initial_blocks) 1951 setreg 1952 vmovdqu AadHash(arg2), reg_i 1953 1954 # start AES for num_initial_blocks blocks 1955 vmovdqu CurCount(arg2), \CTR 1956 1957 i = (9-\num_initial_blocks) 1958 setreg 1959.rep \num_initial_blocks 1960 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1961 vmovdqa \CTR, reg_i 1962 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap 1963 i = (i+1) 1964 setreg 1965.endr 1966 1967 vmovdqa (arg1), \T_key 1968 i = (9-\num_initial_blocks) 1969 setreg 1970.rep \num_initial_blocks 1971 vpxor \T_key, reg_i, reg_i 1972 i = (i+1) 1973 setreg 1974.endr 1975 1976 j = 1 1977 setreg 1978.rep \REP 1979 vmovdqa 16*j(arg1), \T_key 1980 i = (9-\num_initial_blocks) 1981 setreg 1982.rep \num_initial_blocks 1983 vaesenc \T_key, reg_i, reg_i 1984 i = (i+1) 1985 setreg 1986.endr 1987 1988 j = (j+1) 1989 setreg 1990.endr 1991 1992 1993 vmovdqa 16*j(arg1), \T_key 1994 i = (9-\num_initial_blocks) 1995 setreg 1996.rep \num_initial_blocks 1997 vaesenclast \T_key, reg_i, reg_i 1998 i = (i+1) 1999 setreg 2000.endr 2001 2002 i = (9-\num_initial_blocks) 2003 setreg 2004.rep \num_initial_blocks 2005 vmovdqu (arg4, %r11), \T1 2006 vpxor \T1, reg_i, reg_i 2007 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for 2008 # num_initial_blocks blocks 2009 add $16, %r11 2010.if \ENC_DEC == DEC 2011 vmovdqa \T1, reg_i 2012.endif 2013 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 2014 i = (i+1) 2015 setreg 2016.endr 2017 2018 2019 i = (8-\num_initial_blocks) 2020 j = (9-\num_initial_blocks) 2021 setreg 2022 2023.rep \num_initial_blocks 2024 vpxor reg_i, reg_j, reg_j 2025 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks 2026 i = (i+1) 2027 j = (j+1) 2028 setreg 2029.endr 2030 # XMM8 has the combined result here 2031 2032 vmovdqa \XMM8, TMP1(%rsp) 2033 vmovdqa \XMM8, \T3 2034 2035 cmp $128, %r13 2036 jl _initial_blocks_done\@ # no need for precomputed constants 2037 2038############################################################################### 2039# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 2040 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2041 vmovdqa \CTR, \XMM1 2042 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2043 2044 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2045 vmovdqa \CTR, \XMM2 2046 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2047 2048 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2049 vmovdqa \CTR, \XMM3 2050 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2051 2052 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2053 vmovdqa \CTR, \XMM4 2054 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2055 2056 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2057 vmovdqa \CTR, \XMM5 2058 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2059 2060 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2061 vmovdqa \CTR, \XMM6 2062 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2063 2064 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2065 vmovdqa \CTR, \XMM7 2066 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2067 2068 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2069 vmovdqa \CTR, \XMM8 2070 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2071 2072 vmovdqa (arg1), \T_key 2073 vpxor \T_key, \XMM1, \XMM1 2074 vpxor \T_key, \XMM2, \XMM2 2075 vpxor \T_key, \XMM3, \XMM3 2076 vpxor \T_key, \XMM4, \XMM4 2077 vpxor \T_key, \XMM5, \XMM5 2078 vpxor \T_key, \XMM6, \XMM6 2079 vpxor \T_key, \XMM7, \XMM7 2080 vpxor \T_key, \XMM8, \XMM8 2081 2082 i = 1 2083 setreg 2084.rep \REP # do REP rounds 2085 vmovdqa 16*i(arg1), \T_key 2086 vaesenc \T_key, \XMM1, \XMM1 2087 vaesenc \T_key, \XMM2, \XMM2 2088 vaesenc \T_key, \XMM3, \XMM3 2089 vaesenc \T_key, \XMM4, \XMM4 2090 vaesenc \T_key, \XMM5, \XMM5 2091 vaesenc \T_key, \XMM6, \XMM6 2092 vaesenc \T_key, \XMM7, \XMM7 2093 vaesenc \T_key, \XMM8, \XMM8 2094 i = (i+1) 2095 setreg 2096.endr 2097 2098 2099 vmovdqa 16*i(arg1), \T_key 2100 vaesenclast \T_key, \XMM1, \XMM1 2101 vaesenclast \T_key, \XMM2, \XMM2 2102 vaesenclast \T_key, \XMM3, \XMM3 2103 vaesenclast \T_key, \XMM4, \XMM4 2104 vaesenclast \T_key, \XMM5, \XMM5 2105 vaesenclast \T_key, \XMM6, \XMM6 2106 vaesenclast \T_key, \XMM7, \XMM7 2107 vaesenclast \T_key, \XMM8, \XMM8 2108 2109 vmovdqu (arg4, %r11), \T1 2110 vpxor \T1, \XMM1, \XMM1 2111 vmovdqu \XMM1, (arg3 , %r11) 2112 .if \ENC_DEC == DEC 2113 vmovdqa \T1, \XMM1 2114 .endif 2115 2116 vmovdqu 16*1(arg4, %r11), \T1 2117 vpxor \T1, \XMM2, \XMM2 2118 vmovdqu \XMM2, 16*1(arg3 , %r11) 2119 .if \ENC_DEC == DEC 2120 vmovdqa \T1, \XMM2 2121 .endif 2122 2123 vmovdqu 16*2(arg4, %r11), \T1 2124 vpxor \T1, \XMM3, \XMM3 2125 vmovdqu \XMM3, 16*2(arg3 , %r11) 2126 .if \ENC_DEC == DEC 2127 vmovdqa \T1, \XMM3 2128 .endif 2129 2130 vmovdqu 16*3(arg4, %r11), \T1 2131 vpxor \T1, \XMM4, \XMM4 2132 vmovdqu \XMM4, 16*3(arg3 , %r11) 2133 .if \ENC_DEC == DEC 2134 vmovdqa \T1, \XMM4 2135 .endif 2136 2137 vmovdqu 16*4(arg4, %r11), \T1 2138 vpxor \T1, \XMM5, \XMM5 2139 vmovdqu \XMM5, 16*4(arg3 , %r11) 2140 .if \ENC_DEC == DEC 2141 vmovdqa \T1, \XMM5 2142 .endif 2143 2144 vmovdqu 16*5(arg4, %r11), \T1 2145 vpxor \T1, \XMM6, \XMM6 2146 vmovdqu \XMM6, 16*5(arg3 , %r11) 2147 .if \ENC_DEC == DEC 2148 vmovdqa \T1, \XMM6 2149 .endif 2150 2151 vmovdqu 16*6(arg4, %r11), \T1 2152 vpxor \T1, \XMM7, \XMM7 2153 vmovdqu \XMM7, 16*6(arg3 , %r11) 2154 .if \ENC_DEC == DEC 2155 vmovdqa \T1, \XMM7 2156 .endif 2157 2158 vmovdqu 16*7(arg4, %r11), \T1 2159 vpxor \T1, \XMM8, \XMM8 2160 vmovdqu \XMM8, 16*7(arg3 , %r11) 2161 .if \ENC_DEC == DEC 2162 vmovdqa \T1, \XMM8 2163 .endif 2164 2165 add $128, %r11 2166 2167 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2168 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with 2169 # the corresponding ciphertext 2170 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2171 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2172 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2173 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2174 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2175 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2176 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2177 2178############################################################################### 2179 2180_initial_blocks_done\@: 2181 2182 2183.endm 2184 2185 2186 2187# encrypt 8 blocks at a time 2188# ghash the 8 previously encrypted ciphertext blocks 2189# arg1, arg3, arg4 are used as pointers only, not modified 2190# r11 is the data offset value 2191.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC 2192 2193 vmovdqa \XMM1, \T2 2194 vmovdqa \XMM2, TMP2(%rsp) 2195 vmovdqa \XMM3, TMP3(%rsp) 2196 vmovdqa \XMM4, TMP4(%rsp) 2197 vmovdqa \XMM5, TMP5(%rsp) 2198 vmovdqa \XMM6, TMP6(%rsp) 2199 vmovdqa \XMM7, TMP7(%rsp) 2200 vmovdqa \XMM8, TMP8(%rsp) 2201 2202.if \loop_idx == in_order 2203 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT 2204 vpaddd ONE(%rip), \XMM1, \XMM2 2205 vpaddd ONE(%rip), \XMM2, \XMM3 2206 vpaddd ONE(%rip), \XMM3, \XMM4 2207 vpaddd ONE(%rip), \XMM4, \XMM5 2208 vpaddd ONE(%rip), \XMM5, \XMM6 2209 vpaddd ONE(%rip), \XMM6, \XMM7 2210 vpaddd ONE(%rip), \XMM7, \XMM8 2211 vmovdqa \XMM8, \CTR 2212 2213 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2214 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2215 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2216 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2217 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2218 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2219 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2220 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2221.else 2222 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT 2223 vpaddd ONEf(%rip), \XMM1, \XMM2 2224 vpaddd ONEf(%rip), \XMM2, \XMM3 2225 vpaddd ONEf(%rip), \XMM3, \XMM4 2226 vpaddd ONEf(%rip), \XMM4, \XMM5 2227 vpaddd ONEf(%rip), \XMM5, \XMM6 2228 vpaddd ONEf(%rip), \XMM6, \XMM7 2229 vpaddd ONEf(%rip), \XMM7, \XMM8 2230 vmovdqa \XMM8, \CTR 2231.endif 2232 2233 2234 ####################################################################### 2235 2236 vmovdqu (arg1), \T1 2237 vpxor \T1, \XMM1, \XMM1 2238 vpxor \T1, \XMM2, \XMM2 2239 vpxor \T1, \XMM3, \XMM3 2240 vpxor \T1, \XMM4, \XMM4 2241 vpxor \T1, \XMM5, \XMM5 2242 vpxor \T1, \XMM6, \XMM6 2243 vpxor \T1, \XMM7, \XMM7 2244 vpxor \T1, \XMM8, \XMM8 2245 2246 ####################################################################### 2247 2248 2249 2250 2251 2252 vmovdqu 16*1(arg1), \T1 2253 vaesenc \T1, \XMM1, \XMM1 2254 vaesenc \T1, \XMM2, \XMM2 2255 vaesenc \T1, \XMM3, \XMM3 2256 vaesenc \T1, \XMM4, \XMM4 2257 vaesenc \T1, \XMM5, \XMM5 2258 vaesenc \T1, \XMM6, \XMM6 2259 vaesenc \T1, \XMM7, \XMM7 2260 vaesenc \T1, \XMM8, \XMM8 2261 2262 vmovdqu 16*2(arg1), \T1 2263 vaesenc \T1, \XMM1, \XMM1 2264 vaesenc \T1, \XMM2, \XMM2 2265 vaesenc \T1, \XMM3, \XMM3 2266 vaesenc \T1, \XMM4, \XMM4 2267 vaesenc \T1, \XMM5, \XMM5 2268 vaesenc \T1, \XMM6, \XMM6 2269 vaesenc \T1, \XMM7, \XMM7 2270 vaesenc \T1, \XMM8, \XMM8 2271 2272 2273 ####################################################################### 2274 2275 vmovdqu HashKey_8(arg2), \T5 2276 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 2277 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 2278 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 2279 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 2280 vpxor \T5, \T6, \T6 2281 2282 vmovdqu 16*3(arg1), \T1 2283 vaesenc \T1, \XMM1, \XMM1 2284 vaesenc \T1, \XMM2, \XMM2 2285 vaesenc \T1, \XMM3, \XMM3 2286 vaesenc \T1, \XMM4, \XMM4 2287 vaesenc \T1, \XMM5, \XMM5 2288 vaesenc \T1, \XMM6, \XMM6 2289 vaesenc \T1, \XMM7, \XMM7 2290 vaesenc \T1, \XMM8, \XMM8 2291 2292 vmovdqa TMP2(%rsp), \T1 2293 vmovdqu HashKey_7(arg2), \T5 2294 vpclmulqdq $0x11, \T5, \T1, \T3 2295 vpxor \T3, \T4, \T4 2296 2297 vpclmulqdq $0x00, \T5, \T1, \T3 2298 vpxor \T3, \T7, \T7 2299 2300 vpclmulqdq $0x01, \T5, \T1, \T3 2301 vpxor \T3, \T6, \T6 2302 2303 vpclmulqdq $0x10, \T5, \T1, \T3 2304 vpxor \T3, \T6, \T6 2305 2306 vmovdqu 16*4(arg1), \T1 2307 vaesenc \T1, \XMM1, \XMM1 2308 vaesenc \T1, \XMM2, \XMM2 2309 vaesenc \T1, \XMM3, \XMM3 2310 vaesenc \T1, \XMM4, \XMM4 2311 vaesenc \T1, \XMM5, \XMM5 2312 vaesenc \T1, \XMM6, \XMM6 2313 vaesenc \T1, \XMM7, \XMM7 2314 vaesenc \T1, \XMM8, \XMM8 2315 2316 ####################################################################### 2317 2318 vmovdqa TMP3(%rsp), \T1 2319 vmovdqu HashKey_6(arg2), \T5 2320 vpclmulqdq $0x11, \T5, \T1, \T3 2321 vpxor \T3, \T4, \T4 2322 2323 vpclmulqdq $0x00, \T5, \T1, \T3 2324 vpxor \T3, \T7, \T7 2325 2326 vpclmulqdq $0x01, \T5, \T1, \T3 2327 vpxor \T3, \T6, \T6 2328 2329 vpclmulqdq $0x10, \T5, \T1, \T3 2330 vpxor \T3, \T6, \T6 2331 2332 vmovdqu 16*5(arg1), \T1 2333 vaesenc \T1, \XMM1, \XMM1 2334 vaesenc \T1, \XMM2, \XMM2 2335 vaesenc \T1, \XMM3, \XMM3 2336 vaesenc \T1, \XMM4, \XMM4 2337 vaesenc \T1, \XMM5, \XMM5 2338 vaesenc \T1, \XMM6, \XMM6 2339 vaesenc \T1, \XMM7, \XMM7 2340 vaesenc \T1, \XMM8, \XMM8 2341 2342 vmovdqa TMP4(%rsp), \T1 2343 vmovdqu HashKey_5(arg2), \T5 2344 vpclmulqdq $0x11, \T5, \T1, \T3 2345 vpxor \T3, \T4, \T4 2346 2347 vpclmulqdq $0x00, \T5, \T1, \T3 2348 vpxor \T3, \T7, \T7 2349 2350 vpclmulqdq $0x01, \T5, \T1, \T3 2351 vpxor \T3, \T6, \T6 2352 2353 vpclmulqdq $0x10, \T5, \T1, \T3 2354 vpxor \T3, \T6, \T6 2355 2356 vmovdqu 16*6(arg1), \T1 2357 vaesenc \T1, \XMM1, \XMM1 2358 vaesenc \T1, \XMM2, \XMM2 2359 vaesenc \T1, \XMM3, \XMM3 2360 vaesenc \T1, \XMM4, \XMM4 2361 vaesenc \T1, \XMM5, \XMM5 2362 vaesenc \T1, \XMM6, \XMM6 2363 vaesenc \T1, \XMM7, \XMM7 2364 vaesenc \T1, \XMM8, \XMM8 2365 2366 2367 vmovdqa TMP5(%rsp), \T1 2368 vmovdqu HashKey_4(arg2), \T5 2369 vpclmulqdq $0x11, \T5, \T1, \T3 2370 vpxor \T3, \T4, \T4 2371 2372 vpclmulqdq $0x00, \T5, \T1, \T3 2373 vpxor \T3, \T7, \T7 2374 2375 vpclmulqdq $0x01, \T5, \T1, \T3 2376 vpxor \T3, \T6, \T6 2377 2378 vpclmulqdq $0x10, \T5, \T1, \T3 2379 vpxor \T3, \T6, \T6 2380 2381 vmovdqu 16*7(arg1), \T1 2382 vaesenc \T1, \XMM1, \XMM1 2383 vaesenc \T1, \XMM2, \XMM2 2384 vaesenc \T1, \XMM3, \XMM3 2385 vaesenc \T1, \XMM4, \XMM4 2386 vaesenc \T1, \XMM5, \XMM5 2387 vaesenc \T1, \XMM6, \XMM6 2388 vaesenc \T1, \XMM7, \XMM7 2389 vaesenc \T1, \XMM8, \XMM8 2390 2391 vmovdqa TMP6(%rsp), \T1 2392 vmovdqu HashKey_3(arg2), \T5 2393 vpclmulqdq $0x11, \T5, \T1, \T3 2394 vpxor \T3, \T4, \T4 2395 2396 vpclmulqdq $0x00, \T5, \T1, \T3 2397 vpxor \T3, \T7, \T7 2398 2399 vpclmulqdq $0x01, \T5, \T1, \T3 2400 vpxor \T3, \T6, \T6 2401 2402 vpclmulqdq $0x10, \T5, \T1, \T3 2403 vpxor \T3, \T6, \T6 2404 2405 vmovdqu 16*8(arg1), \T1 2406 vaesenc \T1, \XMM1, \XMM1 2407 vaesenc \T1, \XMM2, \XMM2 2408 vaesenc \T1, \XMM3, \XMM3 2409 vaesenc \T1, \XMM4, \XMM4 2410 vaesenc \T1, \XMM5, \XMM5 2411 vaesenc \T1, \XMM6, \XMM6 2412 vaesenc \T1, \XMM7, \XMM7 2413 vaesenc \T1, \XMM8, \XMM8 2414 2415 vmovdqa TMP7(%rsp), \T1 2416 vmovdqu HashKey_2(arg2), \T5 2417 vpclmulqdq $0x11, \T5, \T1, \T3 2418 vpxor \T3, \T4, \T4 2419 2420 vpclmulqdq $0x00, \T5, \T1, \T3 2421 vpxor \T3, \T7, \T7 2422 2423 vpclmulqdq $0x01, \T5, \T1, \T3 2424 vpxor \T3, \T6, \T6 2425 2426 vpclmulqdq $0x10, \T5, \T1, \T3 2427 vpxor \T3, \T6, \T6 2428 2429 2430 ####################################################################### 2431 2432 vmovdqu 16*9(arg1), \T5 2433 vaesenc \T5, \XMM1, \XMM1 2434 vaesenc \T5, \XMM2, \XMM2 2435 vaesenc \T5, \XMM3, \XMM3 2436 vaesenc \T5, \XMM4, \XMM4 2437 vaesenc \T5, \XMM5, \XMM5 2438 vaesenc \T5, \XMM6, \XMM6 2439 vaesenc \T5, \XMM7, \XMM7 2440 vaesenc \T5, \XMM8, \XMM8 2441 2442 vmovdqa TMP8(%rsp), \T1 2443 vmovdqu HashKey(arg2), \T5 2444 2445 vpclmulqdq $0x00, \T5, \T1, \T3 2446 vpxor \T3, \T7, \T7 2447 2448 vpclmulqdq $0x01, \T5, \T1, \T3 2449 vpxor \T3, \T6, \T6 2450 2451 vpclmulqdq $0x10, \T5, \T1, \T3 2452 vpxor \T3, \T6, \T6 2453 2454 vpclmulqdq $0x11, \T5, \T1, \T3 2455 vpxor \T3, \T4, \T1 2456 2457 2458 vmovdqu 16*10(arg1), \T5 2459 2460 i = 11 2461 setreg 2462.rep (\REP-9) 2463 vaesenc \T5, \XMM1, \XMM1 2464 vaesenc \T5, \XMM2, \XMM2 2465 vaesenc \T5, \XMM3, \XMM3 2466 vaesenc \T5, \XMM4, \XMM4 2467 vaesenc \T5, \XMM5, \XMM5 2468 vaesenc \T5, \XMM6, \XMM6 2469 vaesenc \T5, \XMM7, \XMM7 2470 vaesenc \T5, \XMM8, \XMM8 2471 2472 vmovdqu 16*i(arg1), \T5 2473 i = i + 1 2474 setreg 2475.endr 2476 2477 i = 0 2478 j = 1 2479 setreg 2480.rep 8 2481 vpxor 16*i(arg4, %r11), \T5, \T2 2482 .if \ENC_DEC == ENC 2483 vaesenclast \T2, reg_j, reg_j 2484 .else 2485 vaesenclast \T2, reg_j, \T3 2486 vmovdqu 16*i(arg4, %r11), reg_j 2487 vmovdqu \T3, 16*i(arg3, %r11) 2488 .endif 2489 i = (i+1) 2490 j = (j+1) 2491 setreg 2492.endr 2493 ####################################################################### 2494 2495 2496 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs 2497 vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs 2498 vpxor \T3, \T7, \T7 2499 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 2500 2501 2502 2503 ####################################################################### 2504 #first phase of the reduction 2505 vmovdqa POLY2(%rip), \T3 2506 2507 vpclmulqdq $0x01, \T7, \T3, \T2 2508 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs 2509 2510 vpxor \T2, \T7, \T7 # first phase of the reduction complete 2511 ####################################################################### 2512 .if \ENC_DEC == ENC 2513 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer 2514 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer 2515 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer 2516 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer 2517 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer 2518 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer 2519 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer 2520 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer 2521 .endif 2522 2523 ####################################################################### 2524 #second phase of the reduction 2525 vpclmulqdq $0x00, \T7, \T3, \T2 2526 vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2527 2528 vpclmulqdq $0x10, \T7, \T3, \T4 2529 vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) 2530 2531 vpxor \T2, \T4, \T4 # second phase of the reduction complete 2532 ####################################################################### 2533 vpxor \T4, \T1, \T1 # the result is in T1 2534 2535 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2536 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2537 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2538 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2539 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2540 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2541 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2542 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2543 2544 2545 vpxor \T1, \XMM1, \XMM1 2546 2547 2548 2549.endm 2550 2551 2552# GHASH the last 4 ciphertext blocks. 2553.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 2554 2555 ## Karatsuba Method 2556 2557 vmovdqu HashKey_8(arg2), \T5 2558 2559 vpshufd $0b01001110, \XMM1, \T2 2560 vpshufd $0b01001110, \T5, \T3 2561 vpxor \XMM1, \T2, \T2 2562 vpxor \T5, \T3, \T3 2563 2564 vpclmulqdq $0x11, \T5, \XMM1, \T6 2565 vpclmulqdq $0x00, \T5, \XMM1, \T7 2566 2567 vpclmulqdq $0x00, \T3, \T2, \XMM1 2568 2569 ###################### 2570 2571 vmovdqu HashKey_7(arg2), \T5 2572 vpshufd $0b01001110, \XMM2, \T2 2573 vpshufd $0b01001110, \T5, \T3 2574 vpxor \XMM2, \T2, \T2 2575 vpxor \T5, \T3, \T3 2576 2577 vpclmulqdq $0x11, \T5, \XMM2, \T4 2578 vpxor \T4, \T6, \T6 2579 2580 vpclmulqdq $0x00, \T5, \XMM2, \T4 2581 vpxor \T4, \T7, \T7 2582 2583 vpclmulqdq $0x00, \T3, \T2, \T2 2584 2585 vpxor \T2, \XMM1, \XMM1 2586 2587 ###################### 2588 2589 vmovdqu HashKey_6(arg2), \T5 2590 vpshufd $0b01001110, \XMM3, \T2 2591 vpshufd $0b01001110, \T5, \T3 2592 vpxor \XMM3, \T2, \T2 2593 vpxor \T5, \T3, \T3 2594 2595 vpclmulqdq $0x11, \T5, \XMM3, \T4 2596 vpxor \T4, \T6, \T6 2597 2598 vpclmulqdq $0x00, \T5, \XMM3, \T4 2599 vpxor \T4, \T7, \T7 2600 2601 vpclmulqdq $0x00, \T3, \T2, \T2 2602 2603 vpxor \T2, \XMM1, \XMM1 2604 2605 ###################### 2606 2607 vmovdqu HashKey_5(arg2), \T5 2608 vpshufd $0b01001110, \XMM4, \T2 2609 vpshufd $0b01001110, \T5, \T3 2610 vpxor \XMM4, \T2, \T2 2611 vpxor \T5, \T3, \T3 2612 2613 vpclmulqdq $0x11, \T5, \XMM4, \T4 2614 vpxor \T4, \T6, \T6 2615 2616 vpclmulqdq $0x00, \T5, \XMM4, \T4 2617 vpxor \T4, \T7, \T7 2618 2619 vpclmulqdq $0x00, \T3, \T2, \T2 2620 2621 vpxor \T2, \XMM1, \XMM1 2622 2623 ###################### 2624 2625 vmovdqu HashKey_4(arg2), \T5 2626 vpshufd $0b01001110, \XMM5, \T2 2627 vpshufd $0b01001110, \T5, \T3 2628 vpxor \XMM5, \T2, \T2 2629 vpxor \T5, \T3, \T3 2630 2631 vpclmulqdq $0x11, \T5, \XMM5, \T4 2632 vpxor \T4, \T6, \T6 2633 2634 vpclmulqdq $0x00, \T5, \XMM5, \T4 2635 vpxor \T4, \T7, \T7 2636 2637 vpclmulqdq $0x00, \T3, \T2, \T2 2638 2639 vpxor \T2, \XMM1, \XMM1 2640 2641 ###################### 2642 2643 vmovdqu HashKey_3(arg2), \T5 2644 vpshufd $0b01001110, \XMM6, \T2 2645 vpshufd $0b01001110, \T5, \T3 2646 vpxor \XMM6, \T2, \T2 2647 vpxor \T5, \T3, \T3 2648 2649 vpclmulqdq $0x11, \T5, \XMM6, \T4 2650 vpxor \T4, \T6, \T6 2651 2652 vpclmulqdq $0x00, \T5, \XMM6, \T4 2653 vpxor \T4, \T7, \T7 2654 2655 vpclmulqdq $0x00, \T3, \T2, \T2 2656 2657 vpxor \T2, \XMM1, \XMM1 2658 2659 ###################### 2660 2661 vmovdqu HashKey_2(arg2), \T5 2662 vpshufd $0b01001110, \XMM7, \T2 2663 vpshufd $0b01001110, \T5, \T3 2664 vpxor \XMM7, \T2, \T2 2665 vpxor \T5, \T3, \T3 2666 2667 vpclmulqdq $0x11, \T5, \XMM7, \T4 2668 vpxor \T4, \T6, \T6 2669 2670 vpclmulqdq $0x00, \T5, \XMM7, \T4 2671 vpxor \T4, \T7, \T7 2672 2673 vpclmulqdq $0x00, \T3, \T2, \T2 2674 2675 vpxor \T2, \XMM1, \XMM1 2676 2677 ###################### 2678 2679 vmovdqu HashKey(arg2), \T5 2680 vpshufd $0b01001110, \XMM8, \T2 2681 vpshufd $0b01001110, \T5, \T3 2682 vpxor \XMM8, \T2, \T2 2683 vpxor \T5, \T3, \T3 2684 2685 vpclmulqdq $0x11, \T5, \XMM8, \T4 2686 vpxor \T4, \T6, \T6 2687 2688 vpclmulqdq $0x00, \T5, \XMM8, \T4 2689 vpxor \T4, \T7, \T7 2690 2691 vpclmulqdq $0x00, \T3, \T2, \T2 2692 2693 vpxor \T2, \XMM1, \XMM1 2694 vpxor \T6, \XMM1, \XMM1 2695 vpxor \T7, \XMM1, \T2 2696 2697 2698 2699 2700 vpslldq $8, \T2, \T4 2701 vpsrldq $8, \T2, \T2 2702 2703 vpxor \T4, \T7, \T7 2704 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the 2705 # accumulated carry-less multiplications 2706 2707 ####################################################################### 2708 #first phase of the reduction 2709 vmovdqa POLY2(%rip), \T3 2710 2711 vpclmulqdq $0x01, \T7, \T3, \T2 2712 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs 2713 2714 vpxor \T2, \T7, \T7 # first phase of the reduction complete 2715 ####################################################################### 2716 2717 2718 #second phase of the reduction 2719 vpclmulqdq $0x00, \T7, \T3, \T2 2720 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2721 2722 vpclmulqdq $0x10, \T7, \T3, \T4 2723 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) 2724 2725 vpxor \T2, \T4, \T4 # second phase of the reduction complete 2726 ####################################################################### 2727 vpxor \T4, \T6, \T6 # the result is in T6 2728.endm 2729 2730 2731 2732############################################################# 2733#void aesni_gcm_init_avx_gen4 2734# (gcm_data *my_ctx_data, 2735# gcm_context_data *data, 2736# u8 *iv, /* Pre-counter block j0: 4 byte salt 2737# (from Security Association) concatenated with 8 byte 2738# Initialisation Vector (from IPSec ESP Payload) 2739# concatenated with 0x00000001. 16-byte aligned pointer. */ 2740# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ 2741# const u8 *aad, /* Additional Authentication Data (AAD)*/ 2742# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 2743############################################################# 2744SYM_FUNC_START(aesni_gcm_init_avx_gen4) 2745 FUNC_SAVE 2746 INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 2747 FUNC_RESTORE 2748 ret 2749SYM_FUNC_END(aesni_gcm_init_avx_gen4) 2750 2751############################################################################### 2752#void aesni_gcm_enc_avx_gen4( 2753# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2754# gcm_context_data *data, 2755# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ 2756# const u8 *in, /* Plaintext input */ 2757# u64 plaintext_len) /* Length of data in Bytes for encryption. */ 2758############################################################################### 2759SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) 2760 FUNC_SAVE 2761 mov keysize,%eax 2762 cmp $32, %eax 2763 je key_256_enc_update4 2764 cmp $16, %eax 2765 je key_128_enc_update4 2766 # must be 192 2767 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 2768 FUNC_RESTORE 2769 ret 2770key_128_enc_update4: 2771 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 2772 FUNC_RESTORE 2773 ret 2774key_256_enc_update4: 2775 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 2776 FUNC_RESTORE 2777 ret 2778SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) 2779 2780############################################################################### 2781#void aesni_gcm_dec_update_avx_gen4( 2782# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2783# gcm_context_data *data, 2784# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ 2785# const u8 *in, /* Ciphertext input */ 2786# u64 plaintext_len) /* Length of data in Bytes for encryption. */ 2787############################################################################### 2788SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) 2789 FUNC_SAVE 2790 mov keysize,%eax 2791 cmp $32, %eax 2792 je key_256_dec_update4 2793 cmp $16, %eax 2794 je key_128_dec_update4 2795 # must be 192 2796 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 2797 FUNC_RESTORE 2798 ret 2799key_128_dec_update4: 2800 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 2801 FUNC_RESTORE 2802 ret 2803key_256_dec_update4: 2804 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 2805 FUNC_RESTORE 2806 ret 2807SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) 2808 2809############################################################################### 2810#void aesni_gcm_finalize_avx_gen4( 2811# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2812# gcm_context_data *data, 2813# u8 *auth_tag, /* Authenticated Tag output. */ 2814# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 2815# Valid values are 16 (most likely), 12 or 8. */ 2816############################################################################### 2817SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) 2818 FUNC_SAVE 2819 mov keysize,%eax 2820 cmp $32, %eax 2821 je key_256_finalize4 2822 cmp $16, %eax 2823 je key_128_finalize4 2824 # must be 192 2825 GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 2826 FUNC_RESTORE 2827 ret 2828key_128_finalize4: 2829 GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 2830 FUNC_RESTORE 2831 ret 2832key_256_finalize4: 2833 GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 2834 FUNC_RESTORE 2835 ret 2836SYM_FUNC_END(aesni_gcm_finalize_avx_gen4) 2837