########################################################################
# Copyright (c) 2013, Intel Corporation
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
##
## Authors:
##      Erdinc Ozturk <erdinc.ozturk@intel.com>
##      Vinodh Gopal <vinodh.gopal@intel.com>
##      James Guilford <james.guilford@intel.com>
##      Tim Chen <tim.c.chen@linux.intel.com>
##
## References:
##      This code was derived and highly optimized from the code described in the paper:
##              Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
##                      on Intel Architecture Processors. August, 2010
##      The details of the implementation are explained in:
##              Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
##                      on Intel Architecture Processors. October, 2012.
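##
## Usage (added note, an illustrative sketch rather than the authoritative
## prototypes): the ENTRY points at the bottom of this file are driven by the
## aesni-intel glue code as an init / update / finalize sequence, with the
## argument order taken from how arg3..arg6 are consumed by the macros below:
##
##      aesni_gcm_init_avx_gen2(ctx, data, iv, hash_subkey, aad, aad_len);
##      aesni_gcm_enc_update_avx_gen2(ctx, data, out, in, plaintext_len);
##      ...     /* *_update may be called repeatedly as data arrives */
##      aesni_gcm_finalize_avx_gen2(ctx, data, auth_tag, auth_tag_len);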
##
## Assumptions:
##
##
##
## iv:
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                        Salt  (From the SA)                    |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                     Initialization Vector                     |
##       |         (This is the sequence number from IPSec header)       |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x1                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##
##
## AAD:
##       AAD padded to 128 bits with 0
##       for example, assume AAD is a u32 vector
##
##       if AAD is 8 bytes:
##       AAD[3] = {A0, A1}#
##       padded AAD in xmm register = {A1 A0 0 0}
##
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                               SPI (A1)                        |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                     32-bit Sequence Number (A0)               |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x0                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##       AAD Format with 32-bit Sequence Number
##
##       if AAD is 12 bytes:
##       AAD[3] = {A0, A1, A2}#
##       padded AAD in xmm register = {A2 A1 A0 0}
##
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                               SPI (A2)                        |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                 64-bit Extended Sequence Number {A1,A0}       |
##       |                                                               |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x0                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##       AAD Format with 64-bit Extended Sequence Number
##
##
## aadLen:
##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
##       The code additionally supports aadLen of length 16 bytes.
##
## TLen:
##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
##
## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one tab and two tab indentations are used. one tab is
## for GHASH part, two tabs is for AES part.
##

#include <linux/linkage.h>
#include <asm/inst.h>

# constants in mergeable sections, linker can reorder and merge
.section .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:            .octa     0xC2000000000000000000000000000001

.section .rodata.cst16.POLY2, "aM", @progbits, 16
.align 16
POLY2:           .octa     0xC20000000000000000000001C2000000

.section .rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE:          .octa     0x00000001000000000000000000000001

.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F

.section .rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:             .octa     0x00000000000000000000000000000001

.section .rodata.cst16.ONEf, "aM", @progbits, 16
.align 16
ONEf:            .octa     0x01000000000000000000000000000000

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
.section .rodata, "a", @progbits
.align 16
SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
                 .octa     0x00000000000000000000000000000000
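# Added note on the three constants above: SHIFT_MASK, ALL_F and the zero
# block are laid out back to back so one pointer serves two purposes when a
# partial (<16 byte) block is handled.  With %r12 = SHIFT_MASK + n
# (0 <= n <= 16):
#   - a load from (%r12) gives a vpshufb mask that shifts a register right
#     by n bytes, and
#   - a load from ALL_F-SHIFT_MASK(%r12), i.e. 16 bytes further on, gives
#     (16-n) bytes of 0xff followed by n bytes of 0x00, which is used to
#     mask off the unused bytes of the last block.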
.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
        .octa     0xffffffffffffffffffffffffffffffff
        .octa     0xffffffffffffffffffffffffffffff0C
        .octa     0xffffffffffffffffffffffffffff0D0C
        .octa     0xffffffffffffffffffffffffff0E0D0C
        .octa     0xffffffffffffffffffffffff0F0E0D0C
        .octa     0xffffffffffffffffffffff0C0B0A0908
        .octa     0xffffffffffffffffffff0D0C0B0A0908
        .octa     0xffffffffffffffffff0E0D0C0B0A0908
        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
        .octa     0xffffffffffffff0C0B0A090807060504
        .octa     0xffffffffffff0D0C0B0A090807060504
        .octa     0xffffffffff0E0D0C0B0A090807060504
        .octa     0xffffffff0F0E0D0C0B0A090807060504
        .octa     0xffffff0C0B0A09080706050403020100
        .octa     0xffff0D0C0B0A09080706050403020100
        .octa     0xff0E0D0C0B0A09080706050403020100
        .octa     0x0F0E0D0C0B0A09080706050403020100


.text


#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5

HashKey        = 16*6   # store HashKey <<1 mod poly here
HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
HashKey_5      = 16*10  # store HashKey^5 <<1 mod poly here
HashKey_6      = 16*11  # store HashKey^6 <<1 mod poly here
HashKey_7      = 16*12  # store HashKey^7 <<1 mod poly here
HashKey_8      = 16*13  # store HashKey^8 <<1 mod poly here
HashKey_k      = 16*14  # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
HashKey_2_k    = 16*15  # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
HashKey_3_k    = 16*16  # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
HashKey_4_k    = 16*17  # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
HashKey_5_k    = 16*18  # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
HashKey_6_k    = 16*19  # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
HashKey_7_k    = 16*20  # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
HashKey_8_k    = 16*21  # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)

#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
#define arg7 STACK_OFFSET+8*1(%r14)
#define arg8 STACK_OFFSET+8*2(%r14)
#define arg9 STACK_OFFSET+8*3(%r14)
#define arg10 STACK_OFFSET+8*4(%r14)
#define keysize 2*15*16(arg1)

i = 0
j = 0

out_order = 0
in_order = 1
DEC = 0
ENC = 1

.macro define_reg r n
reg_\r = %xmm\n
.endm

.macro setreg
.altmacro
define_reg i %i
define_reg j %j
.noaltmacro
.endm

# need to push 4 registers into stack to maintain
STACK_OFFSET = 8*4

TMP1 =   16*0    # Temporary storage for AAD
TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
TMP3 =   16*2    # Temporary storage for AES State 3
TMP4 =   16*3    # Temporary storage for AES State 4
TMP5 =   16*4    # Temporary storage for AES State 5
TMP6 =   16*5    # Temporary storage for AES State 6
TMP7 =   16*6    # Temporary storage for AES State 7
TMP8 =   16*7    # Temporary storage for AES State 8

VARIABLE_OFFSET = 16*8
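# Added sketch: arg2 points at the gcm_context_data block whose layout the
# offsets above encode.  Viewed from C it is roughly the following (field
# names are illustrative; only the byte offsets come from the defines above):
#
#       struct gcm_context_data {
#               u8  aad_hash[16];               /* AadHash,      16*0      */
#               u64 aad_length;                 /* AadLen,       16*1      */
#               u64 in_length;                  /* InLen,        16*1 + 8  */
#               u8  partial_block_enc_key[16];  /* PBlockEncKey, 16*2      */
#               u8  orig_IV[16];                /* OrigIV,       16*3      */
#               u8  current_counter[16];        /* CurCount,     16*4      */
#               u64 partial_block_len;          /* PBlockLen,    16*5      */
#               u64 unused;
#               u8  hash_keys[16 * 16];         /* HashKey ... HashKey_8_k */
#       };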
################################
# Utility Macros
################################

.macro FUNC_SAVE
        #the number of pushes must equal STACK_OFFSET
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, %r14

        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp                  # align rsp to 64 bytes
.endm

.macro FUNC_RESTORE
        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
.endm

# Encryption of a single block
.macro ENCRYPT_SINGLE_BLOCK REP XMM0
        vpxor    (arg1), \XMM0, \XMM0
        i = 1
        setreg
.rep \REP
        vaesenc  16*i(arg1), \XMM0, \XMM0
        i = (i+1)
        setreg
.endr
        vaesenclast 16*i(arg1), \XMM0, \XMM0
.endm

# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r14, r15
.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
        vmovdqu AadHash(arg2), %xmm8
        vmovdqu HashKey(arg2), %xmm13       # xmm13 = HashKey
        add     arg5, InLen(arg2)

        # initialize the data pointer offset as zero
        xor     %r11d, %r11d

        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
        sub     %r11, arg5

        mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)

        mov     %r13, %r12
        shr     $4, %r12
        and     $7, %r12
        jz      _initial_num_blocks_is_0\@

        cmp     $7, %r12
        je      _initial_num_blocks_is_7\@
        cmp     $6, %r12
        je      _initial_num_blocks_is_6\@
        cmp     $5, %r12
        je      _initial_num_blocks_is_5\@
        cmp     $4, %r12
        je      _initial_num_blocks_is_4\@
        cmp     $3, %r12
        je      _initial_num_blocks_is_3\@
        cmp     $2, %r12
        je      _initial_num_blocks_is_2\@

        jmp     _initial_num_blocks_is_1\@

_initial_num_blocks_is_7\@:
        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*7, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_6\@:
        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*6, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_5\@:
        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*5, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_4\@:
        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*4, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_3\@:
        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*3, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_2\@:
        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*2, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_1\@:
        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*1, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_0\@:
        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC


_initial_blocks_encrypted\@:
        cmp     $0, %r13
        je      _zero_cipher_left\@

        sub     $128, %r13
        je      _eight_cipher_left\@


        vmovd   %xmm9, %r15d
        and     $255, %r15d
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9


_encrypt_by_8_new\@:
        cmp     $(255-8), %r15d
        jg      _encrypt_by_8\@


        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        jmp     _eight_cipher_left\@

_encrypt_by_8\@:
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9


_eight_cipher_left\@:
        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8


_zero_cipher_left\@:
        vmovdqu %xmm14, AadHash(arg2)
        vmovdqu %xmm9, CurCount(arg2)

        # check for 0 length
        mov     arg5, %r13
        and     $15, %r13                            # r13 = (arg5 mod 16)

        je      _multiple_of_16_bytes\@

        # handle the last <16 Byte block separately

        mov     %r13, PBlockLen(arg2)

        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
        vmovdqu %xmm9, CurCount(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9          # E(K, Yn)
        vmovdqu %xmm9, PBlockEncKey(arg2)

        cmp     $16, arg5
        jge     _large_enough_update\@

        lea     (arg4,%r11,1), %r10
        mov     %r13, %r12

        READ_PARTIAL_BLOCK %r10 %r12 %xmm1

        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
                                                     # able to shift 16-r13 bytes (r13 is the
                                                     # number of bytes in plaintext mod 16)

        jmp     _final_ghash_mul\@

_large_enough_update\@:
        sub     $16, %r11
        add     %r13, %r11

        # receive the last <16 Byte block
        vmovdqu (arg4, %r11, 1), %xmm1

        sub     %r13, %r11
        add     $16, %r11

        lea     SHIFT_MASK+16(%rip), %r12
        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        # (r13 is the number of bytes in plaintext mod 16)
        sub     %r13, %r12
        # get the appropriate shuffle mask
        vmovdqu (%r12), %xmm2
        # shift right 16-r13 bytes
        vpshufb %xmm2, %xmm1, %xmm1

_final_ghash_mul\@:
        .if  \ENC_DEC ==  DEC
        vmovdqa %xmm1, %xmm2
        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
                                                     # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm2, %xmm2
        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm14, %xmm14

        vmovdqu %xmm14, AadHash(arg2)
        .else
        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
                                                     # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        vpxor   %xmm9, %xmm14, %xmm14

        vmovdqu %xmm14, AadHash(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
        .endif


        #############################
        # output r13 Bytes
        vmovq   %xmm9, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left\@

        mov     %rax, (arg3 , %r11)
        add     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        vmovq   %xmm9, %rax
        sub     $8, %r13

_less_than_8_bytes_left\@:
        movb    %al, (arg3 , %r11)
        add     $1, %r11
        shr     $8, %rax
        sub     $1, %r13
        jne     _less_than_8_bytes_left\@
        #############################

_multiple_of_16_bytes\@:
.endm


# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
        vmovdqu AadHash(arg2), %xmm14
        vmovdqu HashKey(arg2), %xmm13

        mov     PBlockLen(arg2), %r12
        cmp     $0, %r12
        je      _partial_done\@

        #GHASH computation for the last <16 Byte block
        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
        mov     AadLen(arg2), %r12                   # r12 = aadLen (number of bytes)
        shl     $3, %r12                             # convert into number of bits
        vmovd   %r12d, %xmm15                        # len(A) in xmm15

        mov     InLen(arg2), %r12
        shl     $3, %r12                             # len(C) in bits (*8)
        vmovq   %r12, %xmm1
        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)

        vpxor   %xmm15, %xmm14, %xmm14
        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6  # final GHASH computation
        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap

        vmovdqu OrigIV(arg2), %xmm9

        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9          # E(K, Y0)

        vpxor   %xmm14, %xmm9, %xmm9


_return_T\@:
        mov     \AUTH_TAG, %r10                      # r10 = authTag
        mov     \AUTH_TAG_LEN, %r11                  # r11 = auth_tag_len

        cmp     $16, %r11
        je      _T_16\@

        cmp     $8, %r11
        jl      _T_4\@

_T_8\@:
        vmovq   %xmm9, %rax
        mov     %rax, (%r10)
        add     $8, %r10
        sub     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        cmp     $0, %r11
        je      _return_T_done\@
_T_4\@:
        vmovd   %xmm9, %eax
        mov     %eax, (%r10)
        add     $4, %r10
        sub     $4, %r11
        vpsrldq $4, %xmm9, %xmm9
        cmp     $0, %r11
        je      _return_T_done\@
_T_123\@:
        vmovd   %xmm9, %eax
        cmp     $2, %r11
        jl      _T_1\@
        mov     %ax, (%r10)
        cmp     $2, %r11
        je      _return_T_done\@
        add     $2, %r10
        sar     $16, %eax
_T_1\@:
        mov     %al, (%r10)
        jmp     _return_T_done\@

_T_16\@:
        vmovdqu %xmm9, (%r10)

_return_T_done\@:
.endm

.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8

        mov     \AAD, %r10                           # r10 = AAD
        mov     \AADLEN, %r12                        # r12 = aadLen


        mov     %r12, %r11

        vpxor   \T8, \T8, \T8
        vpxor   \T7, \T7, \T7
        cmp     $16, %r11
        jl      _get_AAD_rest8\@
_get_AAD_blocks\@:
        vmovdqu (%r10), \T7
        vpshufb SHUF_MASK(%rip), \T7, \T7
        vpxor   \T7, \T8, \T8
        \GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
        add     $16, %r10
        sub     $16, %r12
        sub     $16, %r11
        cmp     $16, %r11
        jge     _get_AAD_blocks\@
        vmovdqu \T8, \T7
        cmp     $0, %r11
        je      _get_AAD_done\@

        vpxor   \T7, \T7, \T7

        /* read the last <16B of AAD. since we have at least 4B of
        data right after the AAD (the ICV, and maybe some CT), we can
        read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\@:
        cmp     $4, %r11
        jle     _get_AAD_rest4\@
        movq    (%r10), \T1
        add     $8, %r10
        sub     $8, %r11
        vpslldq $8, \T1, \T1
        vpsrldq $8, \T7, \T7
        vpxor   \T1, \T7, \T7
        jmp     _get_AAD_rest8\@
_get_AAD_rest4\@:
        cmp     $0, %r11
        jle     _get_AAD_rest0\@
        mov     (%r10), %eax
        movq    %rax, \T1
        add     $4, %r10
        sub     $4, %r11
        vpslldq $12, \T1, \T1
        vpsrldq $4, \T7, \T7
        vpxor   \T1, \T7, \T7
_get_AAD_rest0\@:
        /* finalize: shift out the extra bytes we read, and align
        left. since pslldq can only shift by an immediate, we use
        vpshufb and an array of shuffle masks */
        movq    %r12, %r11
        salq    $4, %r11
        vmovdqu aad_shift_arr(%r11), \T1
        vpshufb \T1, \T7, \T7
_get_AAD_rest_final\@:
        vpshufb SHUF_MASK(%rip), \T7, \T7
        vpxor   \T8, \T7, \T7
        \GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6

_get_AAD_done\@:
        vmovdqu \T7, AadHash(arg2)
.endm

.macro INIT GHASH_MUL PRECOMPUTE
        mov     arg6, %r11
        mov     %r11, AadLen(arg2)          # ctx_data.aad_length = aad_length
        xor     %r11d, %r11d
        mov     %r11, InLen(arg2)           # ctx_data.in_length = 0

        mov     %r11, PBlockLen(arg2)       # ctx_data.partial_block_length = 0
        mov     %r11, PBlockEncKey(arg2)    # ctx_data.partial_block_enc_key = 0
        mov     arg3, %rax
        movdqu  (%rax), %xmm0
        movdqu  %xmm0, OrigIV(arg2)         # ctx_data.orig_IV = iv

        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
        movdqu  %xmm0, CurCount(arg2)       # ctx_data.current_counter = iv

        vmovdqu (arg4), %xmm6               # xmm6 = HashKey

        vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa  %xmm6, %xmm2
        vpsllq   $1, %xmm6, %xmm6
        vpsrlq   $63, %xmm2, %xmm2
        vmovdqa  %xmm2, %xmm1
        vpslldq  $8, %xmm2, %xmm2
        vpsrldq  $8, %xmm1, %xmm1
        vpor     %xmm2, %xmm6, %xmm6
        #reduction
        vpshufd  $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand    POLY(%rip), %xmm2, %xmm2
        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
        #######################################################################
        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly

        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0

        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
.endm


# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN
.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
        vpxor \XMMDst, \XMMDst, \XMMDst

        cmp $8, \DLEN
        jl _read_lt8_\@
        mov (\DPTR), %rax
        vpinsrq $0, %rax, \XMMDst, \XMMDst
        sub $8, \DLEN
        jz _done_read_partial_block_\@
        xor %eax, %eax
_read_next_byte_\@:
        shl $8, %rax
        mov 7(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_\@
        vpinsrq $1, %rax, \XMMDst, \XMMDst
        jmp _done_read_partial_block_\@
_read_lt8_\@:
        xor %eax, %eax
_read_next_byte_lt8_\@:
        shl $8, %rax
        mov -1(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_lt8_\@
        vpinsrq $0, %rax, \XMMDst, \XMMDst
_done_read_partial_block_\@:
.endm
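# Added illustrative sketch of what READ_PARTIAL_BLOCK computes (names below
# are not from this file).  The bytes land in XMMDst in the same little-endian
# order a full 16-byte load would produce, and nothing past DPTR+DLEN is
# ever dereferenced:
#
#       /* dst viewed as two 64-bit lanes, dst = {lo, hi} */
#       uint64_t lo = 0, hi = 0;
#       if (dlen >= 8) {
#               memcpy(&lo, dptr, 8);                  /* whole low qword  */
#               for (int i = dlen - 1; i >= 8; i--)    /* leftover bytes   */
#                       hi = (hi << 8) | dptr[i];
#       } else {
#               for (int i = dlen - 1; i >= 0; i--)
#                       lo = (lo << 8) | dptr[i];
#       }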
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
        AAD_HASH ENC_DEC
        mov     PBlockLen(arg2), %r13
        cmp     $0, %r13
        je      _partial_block_done_\@      # Leave Macro if no partial blocks
        # Read in input data without over reading
        cmp     $16, \PLAIN_CYPH_LEN
        jl      _fewer_than_16_bytes_\@
        vmovdqu (\PLAIN_CYPH_IN), %xmm1     # If more than 16 bytes, just fill xmm
        jmp     _data_read_\@

_fewer_than_16_bytes_\@:
        lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
        mov     \PLAIN_CYPH_LEN, %r12
        READ_PARTIAL_BLOCK %r10 %r12 %xmm1

        mov     PBlockLen(arg2), %r13

_data_read_\@:                              # Finished reading in data

        vmovdqu PBlockEncKey(arg2), %xmm9
        vmovdqu HashKey(arg2), %xmm13

        lea     SHIFT_MASK(%rip), %r12

        # adjust the shuffle mask pointer to be able to shift r13 bytes
        # (16-r13 is the number of bytes in plaintext mod 16)
        add     %r13, %r12
        vmovdqu (%r12), %xmm2               # get the appropriate shuffle mask
        vpshufb %xmm2, %xmm9, %xmm9         # shift right r13 bytes

.if \ENC_DEC == DEC
        vmovdqa %xmm1, %xmm3
        pxor    %xmm1, %xmm9                # Cyphertext XOR E(K, Yn)

        mov     \PLAIN_CYPH_LEN, %r10
        add     %r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub     $16, %r10
        # Determine if partial block is not being filled and
        # shift mask accordingly
        jge     _no_extra_mask_1_\@
        sub     %r10, %r12
_no_extra_mask_1_\@:

        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9         # mask out bottom r13 bytes of xmm9

        vpand   %xmm1, %xmm3, %xmm3
        vmovdqa SHUF_MASK(%rip), %xmm10
        vpshufb %xmm10, %xmm3, %xmm3
        vpshufb %xmm2, %xmm3, %xmm3
        vpxor   %xmm3, \AAD_HASH, \AAD_HASH

        cmp     $0, %r10
        jl      _partial_incomplete_1_\@

        # GHASH computation for the last <16 Byte block
        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor     %eax,%eax

        mov     %rax, PBlockLen(arg2)
        jmp     _dec_done_\@
_partial_incomplete_1_\@:
        add     \PLAIN_CYPH_LEN, PBlockLen(arg2)
_dec_done_\@:
        vmovdqu \AAD_HASH, AadHash(arg2)
.else
        vpxor   %xmm1, %xmm9, %xmm9         # Plaintext XOR E(K, Yn)

        mov     \PLAIN_CYPH_LEN, %r10
        add     %r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub     $16, %r10
        # Determine if partial block is not being filled and
        # shift mask accordingly
        jge     _no_extra_mask_2_\@
        sub     %r10, %r12
_no_extra_mask_2_\@:

        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9

        vmovdqa SHUF_MASK(%rip), %xmm1
        vpshufb %xmm1, %xmm9, %xmm9
        vpshufb %xmm2, %xmm9, %xmm9
        vpxor   %xmm9, \AAD_HASH, \AAD_HASH

        cmp     $0, %r10
        jl      _partial_incomplete_2_\@

        # GHASH computation for the last <16 Byte block
        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor     %eax,%eax

        mov     %rax, PBlockLen(arg2)
        jmp     _encode_done_\@
_partial_incomplete_2_\@:
        add     \PLAIN_CYPH_LEN, PBlockLen(arg2)
_encode_done_\@:
        vmovdqu \AAD_HASH, AadHash(arg2)

        vmovdqa SHUF_MASK(%rip), %xmm10
        # shuffle xmm9 back to output as ciphertext
        vpshufb %xmm10, %xmm9, %xmm9
        vpshufb %xmm2, %xmm9, %xmm9
.endif
        # output encrypted Bytes
        cmp     $0, %r10
        jl      _partial_fill_\@
        mov     %r13, %r12
        mov     $16, %r13
        # Set r13 to be the number of bytes to write out
        sub     %r12, %r13
        jmp     _count_set_\@
_partial_fill_\@:
        mov     \PLAIN_CYPH_LEN, %r13
_count_set_\@:
        vmovdqa %xmm9, %xmm0
        vmovq   %xmm0, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left_\@

        mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add     $8, \DATA_OFFSET
        psrldq  $8, %xmm0
        vmovq   %xmm0, %rax
        sub     $8, %r13
_less_than_8_bytes_left_\@:
        movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add     $1, \DATA_OFFSET
        shr     $8, %rax
        sub     $1, %r13
        jne     _less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK

#ifdef CONFIG_AS_AVX
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1 )
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
###############################################################################
.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5

        vpshufd         $0b01001110, \GH, \T2
        vpshufd         $0b01001110, \HK, \T3
        vpxor           \GH, \T2, \T2           # T2 = (a1+a0)
        vpxor           \HK, \T3, \T3           # T3 = (b1+b0)

        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
        vpxor           \GH, \T2,\T2
        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0

        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
        vpxor           \T3, \GH, \GH
        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK

        #first phase of the reduction
        vpslld  $31, \GH, \T2                   # packed right shifting << 31
        vpslld  $30, \GH, \T3                   # packed right shifting shift << 30
        vpslld  $25, \GH, \T4                   # packed right shifting shift << 25

        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW

        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete

        #second phase of the reduction

        vpsrld  $1,\GH, \T2                     # packed left shifting >> 1
        vpsrld  $2,\GH, \T3                     # packed left shifting >> 2
        vpsrld  $7,\GH, \T4                     # packed left shifting >> 7
        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T5, \T2, \T2
        vpxor   \T2, \GH, \GH
        vpxor   \T1, \GH, \GH                   # the result is in GH

.endm

.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6

        # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
        vmovdqa  \HK, \T5

        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
        vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_2_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
        vmovdqu  \T5, HashKey_3(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_3_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
        vmovdqu  \T5, HashKey_4(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_4_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
        vmovdqu  \T5, HashKey_5(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_5_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
        vmovdqu  \T5, HashKey_6(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_6_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
        vmovdqu  \T5, HashKey_7(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_7_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
        vmovdqu  \T5, HashKey_8(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_8_k(arg2)

.endm

## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as a pointer only, not modified

.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
        i = (8-\num_initial_blocks)
        setreg
        vmovdqu AadHash(arg2), reg_i

        # start AES for num_initial_blocks blocks
        vmovdqu CurCount(arg2), \CTR

        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, reg_i
        vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
        i = (i+1)
        setreg
.endr

        vmovdqa  (arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpxor   \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = 1
        setreg
.rep \REP
        vmovdqa  16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenc \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = (j+1)
        setreg
.endr

        vmovdqa  16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenclast      \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vmovdqu (arg4, %r11), \T1
        vpxor   \T1, reg_i, reg_i
        vmovdqu reg_i, (arg3 , %r11)            # write back ciphertext for num_initial_blocks blocks
        add     $16, %r11
.if \ENC_DEC == DEC
        vmovdqa \T1, reg_i
.endif
        vpshufb SHUF_MASK(%rip), reg_i, reg_i   # prepare ciphertext for GHASH computations
        i = (i+1)
        setreg
.endr


        i = (8-\num_initial_blocks)
        j = (9-\num_initial_blocks)
        setreg

.rep \num_initial_blocks
        vpxor    reg_i, reg_j, reg_j
        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
        i = (i+1)
        j = (j+1)
        setreg
.endr
        # XMM8 has the combined result here

        vmovdqa \XMM8, TMP1(%rsp)
        vmovdqa \XMM8, \T3

        cmp     $128, %r13
        jl      _initial_blocks_done\@          # no need for precomputed constants

###############################################################################
# Prepare 8 counter blocks and perform rounds of AES
        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM1
        vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM2
        vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM3
        vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM4
        vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM5
        vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM6
        vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM7
        vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM8
        vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap

        vmovdqa  (arg1), \T_key
        vpxor    \T_key, \XMM1, \XMM1
        vpxor    \T_key, \XMM2, \XMM2
        vpxor    \T_key, \XMM3, \XMM3
        vpxor    \T_key, \XMM4, \XMM4
        vpxor    \T_key, \XMM5, \XMM5
        vpxor    \T_key, \XMM6, \XMM6
        vpxor    \T_key, \XMM7, \XMM7
        vpxor    \T_key, \XMM8, \XMM8

        i = 1
        setreg
.rep    \REP       # do REP rounds
        vmovdqa  16*i(arg1), \T_key
        vaesenc  \T_key, \XMM1, \XMM1
        vaesenc  \T_key, \XMM2, \XMM2
        vaesenc  \T_key, \XMM3, \XMM3
        vaesenc  \T_key, \XMM4, \XMM4
        vaesenc  \T_key, \XMM5, \XMM5
        vaesenc  \T_key, \XMM6, \XMM6
        vaesenc  \T_key, \XMM7, \XMM7
        vaesenc  \T_key, \XMM8, \XMM8
        i = (i+1)
        setreg
.endr

        vmovdqa  16*i(arg1), \T_key
        vaesenclast  \T_key, \XMM1, \XMM1
        vaesenclast  \T_key, \XMM2, \XMM2
        vaesenclast  \T_key, \XMM3, \XMM3
        vaesenclast  \T_key, \XMM4, \XMM4
        vaesenclast  \T_key, \XMM5, \XMM5
        vaesenclast  \T_key, \XMM6, \XMM6
        vaesenclast  \T_key, \XMM7, \XMM7
        vaesenclast  \T_key, \XMM8, \XMM8

        vmovdqu  (arg4, %r11), \T1
        vpxor    \T1, \XMM1, \XMM1
        vmovdqu  \XMM1, (arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa  \T1, \XMM1
        .endif

        vmovdqu  16*1(arg4, %r11), \T1
        vpxor    \T1, \XMM2, \XMM2
        vmovdqu  \XMM2, 16*1(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa  \T1, \XMM2
        .endif

        vmovdqu  16*2(arg4, %r11), \T1
        vpxor    \T1, \XMM3, \XMM3
        vmovdqu  \XMM3, 16*2(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa  \T1, \XMM3
        .endif

        vmovdqu  16*3(arg4, %r11), \T1
        vpxor    \T1, \XMM4, \XMM4
        vmovdqu  \XMM4, 16*3(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa  \T1, \XMM4
        .endif

        vmovdqu  16*4(arg4, %r11), \T1
        vpxor    \T1, \XMM5, \XMM5
        vmovdqu  \XMM5, 16*4(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa  \T1, \XMM5
        .endif

        vmovdqu  16*5(arg4, %r11), \T1
        vpxor    \T1, \XMM6, \XMM6
        vmovdqu  \XMM6, 16*5(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa  \T1, \XMM6
        .endif

        vmovdqu  16*6(arg4, %r11), \T1
        vpxor    \T1, \XMM7, \XMM7
        vmovdqu  \XMM7, 16*6(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa  \T1, \XMM7
        .endif

        vmovdqu  16*7(arg4, %r11), \T1
        vpxor    \T1, \XMM8, \XMM8
        vmovdqu  \XMM8, 16*7(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa  \T1, \XMM8
        .endif

        add     $128, %r11

        vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
        vpxor    TMP1(%rsp), \XMM1, \XMM1       # combine GHASHed value with the corresponding ciphertext
        vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
        vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap

###############################################################################

_initial_blocks_done\@:

.endm

# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
# arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

        vmovdqa \XMM1, \T2
        vmovdqa \XMM2, TMP2(%rsp)
        vmovdqa \XMM3, TMP3(%rsp)
        vmovdqa \XMM4, TMP4(%rsp)
        vmovdqa \XMM5, TMP5(%rsp)
        vmovdqa \XMM6, TMP6(%rsp)
        vmovdqa \XMM7, TMP7(%rsp)
        vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
        vpaddd  ONE(%rip), \CTR, \XMM1          # INCR CNT
        vpaddd  ONE(%rip), \XMM1, \XMM2
        vpaddd  ONE(%rip), \XMM2, \XMM3
        vpaddd  ONE(%rip), \XMM3, \XMM4
        vpaddd  ONE(%rip), \XMM4, \XMM5
        vpaddd  ONE(%rip), \XMM5, \XMM6
        vpaddd  ONE(%rip), \XMM6, \XMM7
        vpaddd  ONE(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
.else
        vpaddd  ONEf(%rip), \CTR, \XMM1         # INCR CNT
        vpaddd  ONEf(%rip), \XMM1, \XMM2
        vpaddd  ONEf(%rip), \XMM2, \XMM3
        vpaddd  ONEf(%rip), \XMM3, \XMM4
        vpaddd  ONEf(%rip), \XMM4, \XMM5
        vpaddd  ONEf(%rip), \XMM5, \XMM6
        vpaddd  ONEf(%rip), \XMM6, \XMM7
        vpaddd  ONEf(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR
.endif


        #######################################################################

        vmovdqu (arg1), \T1
        vpxor   \T1, \XMM1, \XMM1
        vpxor   \T1, \XMM2, \XMM2
        vpxor   \T1, \XMM3, \XMM3
        vpxor   \T1, \XMM4, \XMM4
        vpxor   \T1, \XMM5, \XMM5
        vpxor   \T1, \XMM6, \XMM6
        vpxor   \T1, \XMM7, \XMM7
        vpxor   \T1, \XMM8, \XMM8

        #######################################################################


        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

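        # Added note on the bookkeeping in the interleaved section that
        # follows (register roles are taken from the comments on the first
        # block; this note itself is an added explanation, not original):
        # for each of the 8 saved ciphertext blocks (in \T2 and TMP2..TMP8)
        # and the matching power of the hash key, a Karatsuba multiply is
        # issued:
        #   \T4 accumulates the high products      a1*b1
        #   \T7 accumulates the low products       a0*b0
        #   \T6 accumulates the middle products    (a1+a0)*(b1+b0), using the
        #       precomputed HashKey_i_k = (high64 XOR low64) of HashKey^i
        # The carry-less multiplies are spread between the vaesenc rounds so
        # the AES and PCLMULQDQ units run in parallel; the reduction modulo
        # x^128 + x^127 + x^126 + x^121 + 1 is done once at the end.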
        #######################################################################

        vmovdqu         HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4    # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7    # T7 = a0*b0

        vpshufd         $0b01001110, \T2, \T6
        vpxor           \T2, \T6, \T6

        vmovdqu         HashKey_8_k(arg2), \T5
        vpclmulqdq      $0x00, \T5, \T6, \T6

        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP2(%rsp), \T1
        vmovdqu         HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_7_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa         TMP3(%rsp), \T1
        vmovdqu         HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_6_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP4(%rsp), \T1
        vmovdqu         HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_5_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8


        vmovdqa         TMP5(%rsp), \T1
        vmovdqu         HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_4_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP6(%rsp), \T1
        vmovdqu         HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_3_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6


        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP7(%rsp), \T1
        vmovdqu         HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_2_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        #######################################################################

        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqa         TMP8(%rsp), \T1
        vmovdqu         HashKey(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        vpxor           \T4, \T6, \T6
        vpxor           \T7, \T6, \T6

        vmovdqu 16*10(arg1), \T5

        i = 11
        setreg
.rep (\REP-9)

        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqu 16*i(arg1), \T5
        i = i + 1
        setreg
.endr

        i = 0
        j = 1
        setreg
.rep 8
        vpxor   16*i(arg4, %r11), \T5, \T2
        .if \ENC_DEC == ENC
        vaesenclast     \T2, reg_j, reg_j
        .else
        vaesenclast     \T2, reg_j, \T3
        vmovdqu 16*i(arg4, %r11), reg_j
        vmovdqu \T3, 16*i(arg3, %r11)
        .endif
        i = (i+1)
        j = (j+1)
        setreg
.endr
        #######################################################################


        vpslldq $8, \T6, \T3                    # shift-L T3 2 DWs
        vpsrldq $8, \T6, \T6                    # shift-R T2 2 DWs
        vpxor   \T3, \T7, \T7
        vpxor   \T4, \T6, \T6                   # accumulate the results in T6:T7


        #######################################################################
        #first phase of the reduction
        #######################################################################
        vpslld  $31, \T7, \T2                   # packed right shifting << 31
        vpslld  $30, \T7, \T3                   # packed right shifting shift << 30
        vpslld  $25, \T7, \T4                   # packed right shifting shift << 25

        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T1                    # shift-R T1 1 DW

        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################
        .if \ENC_DEC == ENC
        vmovdqu  \XMM1, 16*0(arg3,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM2, 16*1(arg3,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM3, 16*2(arg3,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM4, 16*3(arg3,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM5, 16*4(arg3,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM6, 16*5(arg3,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM7, 16*6(arg3,%r11)         # Write to the Ciphertext buffer
        vmovdqu  \XMM8, 16*7(arg3,%r11)         # Write to the Ciphertext buffer
        .endif

        #######################################################################
        #second phase of the reduction
        vpsrld  $1, \T7, \T2                    # packed left shifting >> 1
        vpsrld  $2, \T7, \T3                    # packed left shifting >> 2
        vpsrld  $7, \T7, \T4                    # packed left shifting >> 7
        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T1, \T2, \T2
        vpxor   \T2, \T7, \T7
        vpxor   \T7, \T6, \T6                   # the result is in T6
        #######################################################################

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap


        vpxor   \T6, \XMM1, \XMM1


.endm


# GHASH the last 8 ciphertext blocks.
.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method


        vpshufd         $0b01001110, \XMM1, \T2
        vpxor           \XMM1, \T2, \T2
        vmovdqu         HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vmovdqu         HashKey_8_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM2, \T2
        vpxor           \XMM2, \T2, \T2
        vmovdqu         HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_7_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM3, \T2
        vpxor           \XMM3, \T2, \T2
        vmovdqu         HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_6_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM4, \T2
        vpxor           \XMM4, \T2, \T2
        vmovdqu         HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_5_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM5, \T2
        vpxor           \XMM5, \T2, \T2
        vmovdqu         HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_4_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM6, \T2
        vpxor           \XMM6, \T2, \T2
        vmovdqu         HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_3_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM7, \T2
        vpxor           \XMM7, \T2, \T2
        vmovdqu         HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_2_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM8, \T2
        vpxor           \XMM8, \T2, \T2
        vmovdqu         HashKey(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2


        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6           # <T6:T7> holds the result of
                                        # the accumulated carry-less multiplications

        #######################################################################
        #first phase of the reduction
        vpslld  $31, \T7, \T2           # packed right shifting << 31
        vpslld  $30, \T7, \T3           # packed right shifting shift << 30
        vpslld  $25, \T7, \T4           # packed right shifting shift << 25

        vpxor   \T3, \T2, \T2           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T1            # shift-R T1 1 DW

        vpslldq $12, \T2, \T2           # shift-L T2 3 DWs
        vpxor   \T2, \T7, \T7           # first phase of the reduction complete
        #######################################################################


        #second phase of the reduction
        vpsrld  $1, \T7, \T2            # packed left shifting >> 1
        vpsrld  $2, \T7, \T3            # packed left shifting >> 2
        vpsrld  $7, \T7, \T4            # packed left shifting >> 7
        vpxor   \T3, \T2, \T2           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T1, \T2, \T2
        vpxor   \T2, \T7, \T7
        vpxor   \T7, \T6, \T6           # the result is in T6

.endm

#############################################################
#void   aesni_gcm_init_avx_gen2
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#         u8      *iv, /* Pre-counter block j0: 4 byte salt
#                       (from Security Association) concatenated with 8 byte
#                       Initialisation Vector (from IPSec ESP Payload)
#                       concatenated with 0x00000001. 16-byte aligned pointer. */
#         u8      *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         const   u8 *aad, /* Additional Authentication Data (AAD)*/
#         u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
ENTRY(aesni_gcm_init_avx_gen2)
        FUNC_SAVE
        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_init_avx_gen2)

###############################################################################
#void   aesni_gcm_enc_update_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
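# Added note on the key-length dispatch used by the update/finalize entry
# points below: keysize is read from the expanded-key context at arg1 and
# holds the AES key length in bytes.  The REP value passed to the GCM macros
# is the number of middle rounds, i.e. the vaesenc rounds issued between the
# initial whitening vpxor and the final vaesenclast:
#
#       key length      total AES rounds        REP
#       16 (AES-128)    10                      9
#       24 (AES-192)    12                      11
#       32 (AES-256)    14                      13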
ENTRY(aesni_gcm_enc_update_avx_gen2)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      key_256_enc_update
        cmp     $16, %eax
        je      key_128_enc_update
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
        FUNC_RESTORE
        ret
key_128_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
        FUNC_RESTORE
        ret
key_256_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_enc_update_avx_gen2)

###############################################################################
#void   aesni_gcm_dec_update_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
###############################################################################
ENTRY(aesni_gcm_dec_update_avx_gen2)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_dec_update
        cmp     $16, %eax
        je      key_128_dec_update
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
        FUNC_RESTORE
        ret
key_128_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
        FUNC_RESTORE
        ret
key_256_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_dec_update_avx_gen2)

###############################################################################
#void   aesni_gcm_finalize_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                               Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_finalize_avx_gen2)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_finalize
        cmp     $16, %eax
        je      key_128_finalize
        # must be 192
        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
        FUNC_RESTORE
        ret
key_128_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
        FUNC_RESTORE
        ret
key_256_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_finalize_avx_gen2)

#endif /* CONFIG_AS_AVX */

#ifdef CONFIG_AS_AVX2
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1 )
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
###############################################################################
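# Added note for contrast with GHASH_MUL_AVX above: the AVX2 variant below
# computes the four partial products a1*b1, a0*b0, a1*b0 and a0*b1 directly
# (no Karatsuba middle term) and then reduces the 256-bit result with
# carry-less multiplies against the precomputed POLY2 constant in two phases,
# instead of the shift-and-XOR reduction used in the AVX path.  Because no
# Karatsuba term is needed, PRECOMPUTE_AVX2 only stores the HashKey powers
# themselves and skips the HashKey_i_k values.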
###############################################################################
.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5

        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
        vpclmulqdq      $0x00, \HK, \GH, \T2    # T2 = a0*b0
        vpclmulqdq      $0x01, \HK, \GH, \T3    # T3 = a1*b0
        vpclmulqdq      $0x10, \HK, \GH, \GH    # GH = a0*b1
        vpxor           \T3, \GH, \GH

        vpsrldq         $8, \GH, \T3            # shift-R GH 2 DWs
        vpslldq         $8, \GH, \GH            # shift-L GH 2 DWs

        vpxor           \T3, \T1, \T1
        vpxor           \T2, \GH, \GH

        #######################################################################
        # first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \GH, \T3, \T2
        vpslldq         $8, \T2, \T2            # shift-L T2 2 DWs

        vpxor           \T2, \GH, \GH           # first phase of the reduction complete
        #######################################################################
        # second phase of the reduction
        vpclmulqdq      $0x00, \GH, \T3, \T2
        vpsrldq         $4, \T2, \T2            # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \GH, \T3, \GH
        vpslldq         $4, \GH, \GH            # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \GH, \GH           # second phase of the reduction complete
        #######################################################################
        vpxor           \T1, \GH, \GH           # the result is in GH

.endm

.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6

        # precompute HashKey^i<<1 mod poly for i = 2..8 and store them in HashKey_i
        vmovdqa \HK, \T5
        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2        # T5 = HashKey^2<<1 mod poly
        vmovdqu \T5, HashKey_2(arg2)                            # [HashKey_2] = HashKey^2<<1 mod poly

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2        # T5 = HashKey^3<<1 mod poly
        vmovdqu \T5, HashKey_3(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2        # T5 = HashKey^4<<1 mod poly
        vmovdqu \T5, HashKey_4(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2        # T5 = HashKey^5<<1 mod poly
        vmovdqu \T5, HashKey_5(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2        # T5 = HashKey^6<<1 mod poly
        vmovdqu \T5, HashKey_6(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2        # T5 = HashKey^7<<1 mod poly
        vmovdqu \T5, HashKey_7(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2        # T5 = HashKey^8<<1 mod poly
        vmovdqu \T5, HashKey_8(arg2)

.endm

## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as a pointer only, not modified

.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
        i = (8-\num_initial_blocks)
        setreg
        vmovdqu AadHash(arg2), reg_i

        # start AES for num_initial_blocks blocks
        vmovdqu CurCount(arg2), \CTR

        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, reg_i
        vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
        i = (i+1)
        setreg
.endr

        vmovdqa (arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpxor   \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

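        # \REP vaesenc rounds follow, using round keys 1..\REP from arg1;
        # \REP is 9/11/13 for AES-128/192/256, and the last round key
        # (\REP+1) is applied with vaesenclast after the loop.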
        j = 1
        setreg
.rep \REP
        vmovdqa 16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenc \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = (j+1)
        setreg
.endr

        vmovdqa 16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenclast     \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vmovdqu (arg4, %r11), \T1
        vpxor   \T1, reg_i, reg_i
        vmovdqu reg_i, (arg3, %r11)             # write back ciphertext for
                                                # num_initial_blocks blocks
        add     $16, %r11
.if \ENC_DEC == DEC
        vmovdqa \T1, reg_i
.endif
        vpshufb SHUF_MASK(%rip), reg_i, reg_i   # prepare ciphertext for GHASH computations
        i = (i+1)
        setreg
.endr

        i = (8-\num_initial_blocks)
        j = (9-\num_initial_blocks)
        setreg

.rep \num_initial_blocks
        vpxor   reg_i, reg_j, reg_j
        GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6      # apply GHASH on num_initial_blocks blocks
        i = (i+1)
        j = (j+1)
        setreg
.endr
        # XMM8 has the combined result here

        vmovdqa \XMM8, TMP1(%rsp)
        vmovdqa \XMM8, \T3

        cmp     $128, %r13
        jl      _initial_blocks_done\@          # no need for precomputed constants

###############################################################################
# at least 128 bytes remain: prepare and encrypt the next 8 counter blocks
        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM1
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM2
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM3
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM4
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM5
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM6
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM7
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM8
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

        vmovdqa (arg1), \T_key
        vpxor   \T_key, \XMM1, \XMM1
        vpxor   \T_key, \XMM2, \XMM2
        vpxor   \T_key, \XMM3, \XMM3
        vpxor   \T_key, \XMM4, \XMM4
        vpxor   \T_key, \XMM5, \XMM5
        vpxor   \T_key, \XMM6, \XMM6
        vpxor   \T_key, \XMM7, \XMM7
        vpxor   \T_key, \XMM8, \XMM8

        i = 1
        setreg
.rep \REP       # do REP rounds
        vmovdqa 16*i(arg1), \T_key
        vaesenc \T_key, \XMM1, \XMM1
        vaesenc \T_key, \XMM2, \XMM2
        vaesenc \T_key, \XMM3, \XMM3
        vaesenc \T_key, \XMM4, \XMM4
        vaesenc \T_key, \XMM5, \XMM5
        vaesenc \T_key, \XMM6, \XMM6
        vaesenc \T_key, \XMM7, \XMM7
        vaesenc \T_key, \XMM8, \XMM8
        i = (i+1)
        setreg
.endr

        vmovdqa 16*i(arg1), \T_key
        vaesenclast     \T_key, \XMM1, \XMM1
        vaesenclast     \T_key, \XMM2, \XMM2
        vaesenclast     \T_key, \XMM3, \XMM3
        vaesenclast     \T_key, \XMM4, \XMM4
        vaesenclast     \T_key, \XMM5, \XMM5
        vaesenclast     \T_key, \XMM6, \XMM6
        vaesenclast     \T_key, \XMM7, \XMM7
        vaesenclast     \T_key, \XMM8, \XMM8

        vmovdqu (arg4, %r11), \T1
        vpxor   \T1, \XMM1, \XMM1
        vmovdqu \XMM1, (arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM1
        .endif

        vmovdqu 16*1(arg4, %r11), \T1
        vpxor   \T1, \XMM2, \XMM2
        vmovdqu \XMM2, 16*1(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM2
        .endif

        vmovdqu 16*2(arg4, %r11), \T1
        vpxor   \T1, \XMM3, \XMM3
        vmovdqu \XMM3, 16*2(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM3
        .endif

        vmovdqu 16*3(arg4, %r11), \T1
        vpxor   \T1, \XMM4, \XMM4
        vmovdqu \XMM4, 16*3(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM4
        .endif

        vmovdqu 16*4(arg4, %r11), \T1
        vpxor   \T1, \XMM5, \XMM5
        vmovdqu \XMM5, 16*4(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM5
        .endif

        vmovdqu 16*5(arg4, %r11), \T1
        vpxor   \T1, \XMM6, \XMM6
        vmovdqu \XMM6, 16*5(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM6
        .endif

        vmovdqu 16*6(arg4, %r11), \T1
        vpxor   \T1, \XMM7, \XMM7
        vmovdqu \XMM7, 16*6(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM7
        .endif

        vmovdqu 16*7(arg4, %r11), \T1
        vpxor   \T1, \XMM8, \XMM8
        vmovdqu \XMM8, 16*7(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM8
        .endif

        add     $128, %r11

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpxor   TMP1(%rsp), \XMM1, \XMM1        # combine GHASHed value with
                                                # the corresponding ciphertext
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

###############################################################################

_initial_blocks_done\@:

.endm

# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
# arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

        vmovdqa \XMM1, \T2
        vmovdqa \XMM2, TMP2(%rsp)
        vmovdqa \XMM3, TMP3(%rsp)
        vmovdqa \XMM4, TMP4(%rsp)
        vmovdqa \XMM5, TMP5(%rsp)
        vmovdqa \XMM6, TMP6(%rsp)
        vmovdqa \XMM7, TMP7(%rsp)
        vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
        vpaddd  ONE(%rip), \CTR, \XMM1          # INCR CNT
        vpaddd  ONE(%rip), \XMM1, \XMM2
        vpaddd  ONE(%rip), \XMM2, \XMM3
        vpaddd  ONE(%rip), \XMM3, \XMM4
        vpaddd  ONE(%rip), \XMM4, \XMM5
        vpaddd  ONE(%rip), \XMM5, \XMM6
        vpaddd  ONE(%rip), \XMM6, \XMM7
        vpaddd  ONE(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
.else
        vpaddd  ONEf(%rip), \CTR, \XMM1         # INCR CNT
        vpaddd  ONEf(%rip), \XMM1, \XMM2
        vpaddd  ONEf(%rip), \XMM2, \XMM3
        vpaddd  ONEf(%rip), \XMM3, \XMM4
        vpaddd  ONEf(%rip), \XMM4, \XMM5
        vpaddd  ONEf(%rip), \XMM5, \XMM6
        vpaddd  ONEf(%rip), \XMM6, \XMM7
        vpaddd  ONEf(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR
.endif

        #######################################################################

        vmovdqu (arg1), \T1
        vpxor   \T1, \XMM1, \XMM1
        vpxor   \T1, \XMM2, \XMM2
        vpxor   \T1, \XMM3, \XMM3
        vpxor   \T1, \XMM4, \XMM4
        vpxor   \T1, \XMM5, \XMM5
        vpxor   \T1, \XMM6, \XMM6
        vpxor   \T1, \XMM7, \XMM7
        vpxor   \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqu HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4    # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7    # T7 = a0*b0
        vpclmulqdq      $0x01, \T5, \T2, \T6    # T6 = a1*b0
        vpclmulqdq      $0x10, \T5, \T2, \T5    # T5 = a0*b1
        vpxor   \T5, \T6, \T6

        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP2(%rsp), \T1
        vmovdqu HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa TMP3(%rsp), \T1
        vmovdqu HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP4(%rsp), \T1
        vmovdqu HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP5(%rsp), \T1
        vmovdqu HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP6(%rsp), \T1
        vmovdqu HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP7(%rsp), \T1
        vmovdqu HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        #######################################################################

        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqa TMP8(%rsp), \T1
        vmovdqu HashKey(arg2), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T1

        vmovdqu 16*10(arg1), \T5

        i = 11
        setreg
.rep (\REP-9)
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqu 16*i(arg1), \T5
        i = i + 1
        setreg
.endr

        i = 0
        j = 1
        setreg
.rep 8
        vpxor   16*i(arg4, %r11), \T5, \T2
        .if \ENC_DEC == ENC
        vaesenclast     \T2, reg_j, reg_j
        .else
        vaesenclast     \T2, reg_j, \T3
        vmovdqu 16*i(arg4, %r11), reg_j
        vmovdqu \T3, 16*i(arg3, %r11)
        .endif
        i = (i+1)
        j = (j+1)
        setreg
.endr
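        # Note: the final AES round is fused with the CTR xor above: each input
        # block is first xored into the last round key (\T5), so vaesenclast
        # yields keystream^block directly and no separate xor pass is needed.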
        #######################################################################

        vpslldq $8, \T6, \T3                    # shift-L T6 2 DWs (into T3)
        vpsrldq $8, \T6, \T6                    # shift-R T6 2 DWs
        vpxor   \T3, \T7, \T7
        vpxor   \T6, \T1, \T1                   # accumulate the results in T1:T7

        #######################################################################
        # first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2            # shift-L T2 2 DWs

        vpxor           \T2, \T7, \T7           # first phase of the reduction complete
        #######################################################################
        .if \ENC_DEC == ENC
        vmovdqu \XMM1, 16*0(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg3,%r11)          # Write to the Ciphertext buffer
        .endif

        #######################################################################
        # second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2            # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4            # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4           # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T1, \T1           # the result is in T1

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

        vpxor   \T1, \XMM1, \XMM1

.endm

# GHASH the last 8 ciphertext blocks.
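# Each block i is multiplied by HashKey^(9-i) and the products are summed, so a
# single reduction at the end covers all eight multiplications.  Each 256-bit
# product is built from three vpclmulqdq results via Karatsuba:
#     a*b = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
# with (a1+a0) and (b1+b0) formed by the vpshufd/vpxor pairs below.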
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method

        vmovdqu HashKey_8(arg2), \T5

        vpshufd $0b01001110, \XMM1, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM1, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################

        vmovdqu HashKey_7(arg2), \T5
        vpshufd $0b01001110, \XMM2, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM2, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_6(arg2), \T5
        vpshufd $0b01001110, \XMM3, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM3, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_5(arg2), \T5
        vpshufd $0b01001110, \XMM4, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM4, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_4(arg2), \T5
        vpshufd $0b01001110, \XMM5, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM5, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_3(arg2), \T5
        vpshufd $0b01001110, \XMM6, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM6, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_2(arg2), \T5
        vpshufd $0b01001110, \XMM7, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM7, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey(arg2), \T5
        vpshufd $0b01001110, \XMM8, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM8, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1
        vpxor   \T6, \XMM1, \XMM1
        vpxor   \T7, \XMM1, \T2

        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                                # accumulated carry-less multiplications

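        # <T6:T7> is a 256-bit value in GF(2^128); the two phases below fold it
        # modulo the GCM polynomial using POLY2, leaving the 128-bit remainder
        # in T6 (same scheme as in GHASH_MUL_AVX2 above).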
        #######################################################################
        # first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2            # shift-L T2 2 DWs

        vpxor           \T2, \T7, \T7           # first phase of the reduction complete
        #######################################################################

        # second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2            # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4            # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4           # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T6, \T6           # the result is in T6
.endm

#############################################################
#void   aesni_gcm_init_avx_gen4
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#         u8 *iv, /* Pre-counter block j0: 4 byte salt
#                    (from Security Association) concatenated with 8 byte
#                    Initialisation Vector (from IPSec ESP Payload)
#                    concatenated with 0x00000001. 16-byte aligned pointer. */
#         u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         const u8 *aad, /* Additional Authentication Data (AAD)*/
#         u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
ENTRY(aesni_gcm_init_avx_gen4)
        FUNC_SAVE
        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_init_avx_gen4)

###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
ENTRY(aesni_gcm_enc_update_avx_gen4)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      key_256_enc_update4
        cmp     $16, %eax
        je      key_128_enc_update4
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
        FUNC_RESTORE
        ret
key_128_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
        FUNC_RESTORE
        ret
key_256_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_enc_update_avx_gen4)

###############################################################################
#void   aesni_gcm_dec_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
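#
# Decryption shares GCM_ENC_DEC with encryption; the DEC flag only changes
# which data is fed to GHASH (the saved incoming ciphertext is hashed instead
# of the freshly produced output).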
###############################################################################
ENTRY(aesni_gcm_dec_update_avx_gen4)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      key_256_dec_update4
        cmp     $16, %eax
        je      key_128_dec_update4
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
        FUNC_RESTORE
        ret
key_128_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
        FUNC_RESTORE
        ret
key_256_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_dec_update_avx_gen4)

###############################################################################
#void   aesni_gcm_finalize_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_finalize_avx_gen4)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      key_256_finalize4
        cmp     $16, %eax
        je      key_128_finalize4
        # must be 192
        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
        FUNC_RESTORE
        ret
key_128_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
        FUNC_RESTORE
        ret
key_256_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_finalize_avx_gen4)

#endif /* CONFIG_AS_AVX2 */
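# Expected call sequence (sketch only; the real callers live in the glue code,
# aesni-intel_glue.c, and pass the context structures set up there):
#
#       aesni_gcm_init_avx_gen4(ctx, data, iv, hash_subkey, aad, aad_len);
#       aesni_gcm_enc_update_avx_gen4(ctx, data, out, in, len);   /* may repeat */
#       aesni_gcm_finalize_avx_gen4(ctx, data, auth_tag, auth_tag_len);
#
# with the *_avx_gen2 variants typically used when AVX but not AVX2 is available.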