1######################################################################## 2# Copyright (c) 2013, Intel Corporation 3# 4# This software is available to you under a choice of one of two 5# licenses. You may choose to be licensed under the terms of the GNU 6# General Public License (GPL) Version 2, available from the file 7# COPYING in the main directory of this source tree, or the 8# OpenIB.org BSD license below: 9# 10# Redistribution and use in source and binary forms, with or without 11# modification, are permitted provided that the following conditions are 12# met: 13# 14# * Redistributions of source code must retain the above copyright 15# notice, this list of conditions and the following disclaimer. 16# 17# * Redistributions in binary form must reproduce the above copyright 18# notice, this list of conditions and the following disclaimer in the 19# documentation and/or other materials provided with the 20# distribution. 21# 22# * Neither the name of the Intel Corporation nor the names of its 23# contributors may be used to endorse or promote products derived from 24# this software without specific prior written permission. 25# 26# 27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY 28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR 31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR 34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38######################################################################## 39## 40## Authors: 41## Erdinc Ozturk <erdinc.ozturk@intel.com> 42## Vinodh Gopal <vinodh.gopal@intel.com> 43## James Guilford <james.guilford@intel.com> 44## Tim Chen <tim.c.chen@linux.intel.com> 45## 46## References: 47## This code was derived and highly optimized from the code described in paper: 48## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation 49## on Intel Architecture Processors. August, 2010 50## The details of the implementation is explained in: 51## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode 52## on Intel Architecture Processors. October, 2012. 53## 54## Assumptions: 55## 56## 57## 58## iv: 59## 0 1 2 3 60## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 61## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 62## | Salt (From the SA) | 63## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 64## | Initialization Vector | 65## | (This is the sequence number from IPSec header) | 66## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 67## | 0x1 | 68## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 69## 70## 71## 72## AAD: 73## AAD padded to 128 bits with 0 74## for example, assume AAD is a u32 vector 75## 76## if AAD is 8 bytes: 77## AAD[3] = {A0, A1}# 78## padded AAD in xmm register = {A1 A0 0 0} 79## 80## 0 1 2 3 81## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 82## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 83## | SPI (A1) | 84## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 85## | 32-bit Sequence Number (A0) | 86## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 87## | 0x0 | 88## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 89## 90## AAD Format with 32-bit Sequence Number 91## 92## if AAD is 12 bytes: 93## AAD[3] = {A0, A1, A2}# 94## padded AAD in xmm register = {A2 A1 A0 0} 95## 96## 0 1 2 3 97## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 98## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 99## | SPI (A2) | 100## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 101## | 64-bit Extended Sequence Number {A1,A0} | 102## | | 103## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 104## | 0x0 | 105## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 106## 107## AAD Format with 64-bit Extended Sequence Number 108## 109## 110## aadLen: 111## from the definition of the spec, aadLen can only be 8 or 12 bytes. 112## The code additionally supports aadLen of length 16 bytes. 113## 114## TLen: 115## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 116## 117## poly = x^128 + x^127 + x^126 + x^121 + 1 118## throughout the code, one tab and two tab indentations are used. one tab is 119## for GHASH part, two tabs is for AES part. 120## 121 122#include <linux/linkage.h> 123 124# constants in mergeable sections, linker can reorder and merge 125.section .rodata.cst16.POLY, "aM", @progbits, 16 126.align 16 127POLY: .octa 0xC2000000000000000000000000000001 128 129.section .rodata.cst16.POLY2, "aM", @progbits, 16 130.align 16 131POLY2: .octa 0xC20000000000000000000001C2000000 132 133.section .rodata.cst16.TWOONE, "aM", @progbits, 16 134.align 16 135TWOONE: .octa 0x00000001000000000000000000000001 136 137.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 138.align 16 139SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F 140 141.section .rodata.cst16.ONE, "aM", @progbits, 16 142.align 16 143ONE: .octa 0x00000000000000000000000000000001 144 145.section .rodata.cst16.ONEf, "aM", @progbits, 16 146.align 16 147ONEf: .octa 0x01000000000000000000000000000000 148 149# order of these constants should not change. 150# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F 151.section .rodata, "a", @progbits 152.align 16 153SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 154ALL_F: .octa 0xffffffffffffffffffffffffffffffff 155 .octa 0x00000000000000000000000000000000 156 157.text 158 159 160#define AadHash 16*0 161#define AadLen 16*1 162#define InLen (16*1)+8 163#define PBlockEncKey 16*2 164#define OrigIV 16*3 165#define CurCount 16*4 166#define PBlockLen 16*5 167 168HashKey = 16*6 # store HashKey <<1 mod poly here 169HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here 170HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here 171HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here 172HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here 173HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here 174HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here 175HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here 176HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) 177HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) 178HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) 179HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) 180HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) 181HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) 182HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) 183HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) 184 185#define arg1 %rdi 186#define arg2 %rsi 187#define arg3 %rdx 188#define arg4 %rcx 189#define arg5 %r8 190#define arg6 %r9 191#define keysize 2*15*16(arg1) 192 193i = 0 194j = 0 195 196out_order = 0 197in_order = 1 198DEC = 0 199ENC = 1 200 201.macro define_reg r n 202reg_\r = %xmm\n 203.endm 204 205.macro setreg 206.altmacro 207define_reg i %i 208define_reg j %j 209.noaltmacro 210.endm 211 212TMP1 = 16*0 # Temporary storage for AAD 213TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) 214TMP3 = 16*2 # Temporary storage for AES State 3 215TMP4 = 16*3 # Temporary storage for AES State 4 216TMP5 = 16*4 # Temporary storage for AES State 5 217TMP6 = 16*5 # Temporary storage for AES State 6 218TMP7 = 16*6 # Temporary storage for AES State 7 219TMP8 = 16*7 # Temporary storage for AES State 8 220 221VARIABLE_OFFSET = 16*8 222 223################################ 224# Utility Macros 225################################ 226 227.macro FUNC_SAVE 228 push %r12 229 push %r13 230 push %r15 231 232 push %rbp 233 mov %rsp, %rbp 234 235 sub $VARIABLE_OFFSET, %rsp 236 and $~63, %rsp # align rsp to 64 bytes 237.endm 238 239.macro FUNC_RESTORE 240 mov %rbp, %rsp 241 pop %rbp 242 243 pop %r15 244 pop %r13 245 pop %r12 246.endm 247 248# Encryption of a single block 249.macro ENCRYPT_SINGLE_BLOCK REP XMM0 250 vpxor (arg1), \XMM0, \XMM0 251 i = 1 252 setreg 253.rep \REP 254 vaesenc 16*i(arg1), \XMM0, \XMM0 255 i = (i+1) 256 setreg 257.endr 258 vaesenclast 16*i(arg1), \XMM0, \XMM0 259.endm 260 261# combined for GCM encrypt and decrypt functions 262# clobbering all xmm registers 263# clobbering r10, r11, r12, r13, r15, rax 264.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP 265 vmovdqu AadHash(arg2), %xmm8 266 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey 267 add arg5, InLen(arg2) 268 269 # initialize the data pointer offset as zero 270 xor %r11d, %r11d 271 272 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC 273 sub %r11, arg5 274 275 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext 276 and $-16, %r13 # r13 = r13 - (r13 mod 16) 277 278 mov %r13, %r12 279 shr $4, %r12 280 and $7, %r12 281 jz .L_initial_num_blocks_is_0\@ 282 283 cmp $7, %r12 284 je .L_initial_num_blocks_is_7\@ 285 cmp $6, %r12 286 je .L_initial_num_blocks_is_6\@ 287 cmp $5, %r12 288 je .L_initial_num_blocks_is_5\@ 289 cmp $4, %r12 290 je .L_initial_num_blocks_is_4\@ 291 cmp $3, %r12 292 je .L_initial_num_blocks_is_3\@ 293 cmp $2, %r12 294 je .L_initial_num_blocks_is_2\@ 295 296 jmp .L_initial_num_blocks_is_1\@ 297 298.L_initial_num_blocks_is_7\@: 299 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 300 sub $16*7, %r13 301 jmp .L_initial_blocks_encrypted\@ 302 303.L_initial_num_blocks_is_6\@: 304 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 305 sub $16*6, %r13 306 jmp .L_initial_blocks_encrypted\@ 307 308.L_initial_num_blocks_is_5\@: 309 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 310 sub $16*5, %r13 311 jmp .L_initial_blocks_encrypted\@ 312 313.L_initial_num_blocks_is_4\@: 314 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 315 sub $16*4, %r13 316 jmp .L_initial_blocks_encrypted\@ 317 318.L_initial_num_blocks_is_3\@: 319 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 320 sub $16*3, %r13 321 jmp .L_initial_blocks_encrypted\@ 322 323.L_initial_num_blocks_is_2\@: 324 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 325 sub $16*2, %r13 326 jmp .L_initial_blocks_encrypted\@ 327 328.L_initial_num_blocks_is_1\@: 329 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 330 sub $16*1, %r13 331 jmp .L_initial_blocks_encrypted\@ 332 333.L_initial_num_blocks_is_0\@: 334 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 335 336 337.L_initial_blocks_encrypted\@: 338 test %r13, %r13 339 je .L_zero_cipher_left\@ 340 341 sub $128, %r13 342 je .L_eight_cipher_left\@ 343 344 345 346 347 vmovd %xmm9, %r15d 348 and $255, %r15d 349 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 350 351 352.L_encrypt_by_8_new\@: 353 cmp $(255-8), %r15d 354 jg .L_encrypt_by_8\@ 355 356 357 358 add $8, %r15b 359 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC 360 add $128, %r11 361 sub $128, %r13 362 jne .L_encrypt_by_8_new\@ 363 364 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 365 jmp .L_eight_cipher_left\@ 366 367.L_encrypt_by_8\@: 368 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 369 add $8, %r15b 370 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC 371 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 372 add $128, %r11 373 sub $128, %r13 374 jne .L_encrypt_by_8_new\@ 375 376 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 377 378 379 380 381.L_eight_cipher_left\@: 382 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 383 384 385.L_zero_cipher_left\@: 386 vmovdqu %xmm14, AadHash(arg2) 387 vmovdqu %xmm9, CurCount(arg2) 388 389 # check for 0 length 390 mov arg5, %r13 391 and $15, %r13 # r13 = (arg5 mod 16) 392 393 je .L_multiple_of_16_bytes\@ 394 395 # handle the last <16 Byte block separately 396 397 mov %r13, PBlockLen(arg2) 398 399 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn 400 vmovdqu %xmm9, CurCount(arg2) 401 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 402 403 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) 404 vmovdqu %xmm9, PBlockEncKey(arg2) 405 406 cmp $16, arg5 407 jge .L_large_enough_update\@ 408 409 lea (arg4,%r11,1), %r10 410 mov %r13, %r12 411 412 READ_PARTIAL_BLOCK %r10 %r12 %xmm1 413 414 lea SHIFT_MASK+16(%rip), %r12 415 sub %r13, %r12 # adjust the shuffle mask pointer to be 416 # able to shift 16-r13 bytes (r13 is the 417 # number of bytes in plaintext mod 16) 418 419 jmp .L_final_ghash_mul\@ 420 421.L_large_enough_update\@: 422 sub $16, %r11 423 add %r13, %r11 424 425 # receive the last <16 Byte block 426 vmovdqu (arg4, %r11, 1), %xmm1 427 428 sub %r13, %r11 429 add $16, %r11 430 431 lea SHIFT_MASK+16(%rip), %r12 432 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 433 # (r13 is the number of bytes in plaintext mod 16) 434 sub %r13, %r12 435 # get the appropriate shuffle mask 436 vmovdqu (%r12), %xmm2 437 # shift right 16-r13 bytes 438 vpshufb %xmm2, %xmm1, %xmm1 439 440.L_final_ghash_mul\@: 441 .if \ENC_DEC == DEC 442 vmovdqa %xmm1, %xmm2 443 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 444 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 445 # mask out top 16-r13 bytes of xmm9 446 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 447 vpand %xmm1, %xmm2, %xmm2 448 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 449 vpxor %xmm2, %xmm14, %xmm14 450 451 vmovdqu %xmm14, AadHash(arg2) 452 .else 453 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 454 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 455 # mask out top 16-r13 bytes of xmm9 456 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 457 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 458 vpxor %xmm9, %xmm14, %xmm14 459 460 vmovdqu %xmm14, AadHash(arg2) 461 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext 462 .endif 463 464 465 ############################# 466 # output r13 Bytes 467 vmovq %xmm9, %rax 468 cmp $8, %r13 469 jle .L_less_than_8_bytes_left\@ 470 471 mov %rax, (arg3 , %r11) 472 add $8, %r11 473 vpsrldq $8, %xmm9, %xmm9 474 vmovq %xmm9, %rax 475 sub $8, %r13 476 477.L_less_than_8_bytes_left\@: 478 movb %al, (arg3 , %r11) 479 add $1, %r11 480 shr $8, %rax 481 sub $1, %r13 482 jne .L_less_than_8_bytes_left\@ 483 ############################# 484 485.L_multiple_of_16_bytes\@: 486.endm 487 488 489# GCM_COMPLETE Finishes update of tag of last partial block 490# Output: Authorization Tag (AUTH_TAG) 491# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 492.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN 493 vmovdqu AadHash(arg2), %xmm14 494 vmovdqu HashKey(arg2), %xmm13 495 496 mov PBlockLen(arg2), %r12 497 test %r12, %r12 498 je .L_partial_done\@ 499 500 #GHASH computation for the last <16 Byte block 501 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 502 503.L_partial_done\@: 504 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) 505 shl $3, %r12 # convert into number of bits 506 vmovd %r12d, %xmm15 # len(A) in xmm15 507 508 mov InLen(arg2), %r12 509 shl $3, %r12 # len(C) in bits (*128) 510 vmovq %r12, %xmm1 511 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 512 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) 513 514 vpxor %xmm15, %xmm14, %xmm14 515 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation 516 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap 517 518 vmovdqu OrigIV(arg2), %xmm9 519 520 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) 521 522 vpxor %xmm14, %xmm9, %xmm9 523 524 525 526.L_return_T\@: 527 mov \AUTH_TAG, %r10 # r10 = authTag 528 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len 529 530 cmp $16, %r11 531 je .L_T_16\@ 532 533 cmp $8, %r11 534 jl .L_T_4\@ 535 536.L_T_8\@: 537 vmovq %xmm9, %rax 538 mov %rax, (%r10) 539 add $8, %r10 540 sub $8, %r11 541 vpsrldq $8, %xmm9, %xmm9 542 test %r11, %r11 543 je .L_return_T_done\@ 544.L_T_4\@: 545 vmovd %xmm9, %eax 546 mov %eax, (%r10) 547 add $4, %r10 548 sub $4, %r11 549 vpsrldq $4, %xmm9, %xmm9 550 test %r11, %r11 551 je .L_return_T_done\@ 552.L_T_123\@: 553 vmovd %xmm9, %eax 554 cmp $2, %r11 555 jl .L_T_1\@ 556 mov %ax, (%r10) 557 cmp $2, %r11 558 je .L_return_T_done\@ 559 add $2, %r10 560 sar $16, %eax 561.L_T_1\@: 562 mov %al, (%r10) 563 jmp .L_return_T_done\@ 564 565.L_T_16\@: 566 vmovdqu %xmm9, (%r10) 567 568.L_return_T_done\@: 569.endm 570 571.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 572 573 mov \AAD, %r10 # r10 = AAD 574 mov \AADLEN, %r12 # r12 = aadLen 575 576 577 mov %r12, %r11 578 579 vpxor \T8, \T8, \T8 580 vpxor \T7, \T7, \T7 581 cmp $16, %r11 582 jl .L_get_AAD_rest8\@ 583.L_get_AAD_blocks\@: 584 vmovdqu (%r10), \T7 585 vpshufb SHUF_MASK(%rip), \T7, \T7 586 vpxor \T7, \T8, \T8 587 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 588 add $16, %r10 589 sub $16, %r12 590 sub $16, %r11 591 cmp $16, %r11 592 jge .L_get_AAD_blocks\@ 593 vmovdqu \T8, \T7 594 test %r11, %r11 595 je .L_get_AAD_done\@ 596 597 vpxor \T7, \T7, \T7 598 599 /* read the last <16B of AAD. since we have at least 4B of 600 data right after the AAD (the ICV, and maybe some CT), we can 601 read 4B/8B blocks safely, and then get rid of the extra stuff */ 602.L_get_AAD_rest8\@: 603 cmp $4, %r11 604 jle .L_get_AAD_rest4\@ 605 movq (%r10), \T1 606 add $8, %r10 607 sub $8, %r11 608 vpslldq $8, \T1, \T1 609 vpsrldq $8, \T7, \T7 610 vpxor \T1, \T7, \T7 611 jmp .L_get_AAD_rest8\@ 612.L_get_AAD_rest4\@: 613 test %r11, %r11 614 jle .L_get_AAD_rest0\@ 615 mov (%r10), %eax 616 movq %rax, \T1 617 add $4, %r10 618 sub $4, %r11 619 vpslldq $12, \T1, \T1 620 vpsrldq $4, \T7, \T7 621 vpxor \T1, \T7, \T7 622.L_get_AAD_rest0\@: 623 /* finalize: shift out the extra bytes we read, and align 624 left. since pslldq can only shift by an immediate, we use 625 vpshufb and a pair of shuffle masks */ 626 leaq ALL_F(%rip), %r11 627 subq %r12, %r11 628 vmovdqu 16(%r11), \T1 629 andq $~3, %r11 630 vpshufb (%r11), \T7, \T7 631 vpand \T1, \T7, \T7 632.L_get_AAD_rest_final\@: 633 vpshufb SHUF_MASK(%rip), \T7, \T7 634 vpxor \T8, \T7, \T7 635 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 636 637.L_get_AAD_done\@: 638 vmovdqu \T7, AadHash(arg2) 639.endm 640 641.macro INIT GHASH_MUL PRECOMPUTE 642 mov arg6, %r11 643 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length 644 xor %r11d, %r11d 645 mov %r11, InLen(arg2) # ctx_data.in_length = 0 646 647 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 648 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 649 mov arg3, %rax 650 movdqu (%rax), %xmm0 651 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv 652 653 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 654 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv 655 656 vmovdqu (arg4), %xmm6 # xmm6 = HashKey 657 658 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 659 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey 660 vmovdqa %xmm6, %xmm2 661 vpsllq $1, %xmm6, %xmm6 662 vpsrlq $63, %xmm2, %xmm2 663 vmovdqa %xmm2, %xmm1 664 vpslldq $8, %xmm2, %xmm2 665 vpsrldq $8, %xmm1, %xmm1 666 vpor %xmm2, %xmm6, %xmm6 667 #reduction 668 vpshufd $0b00100100, %xmm1, %xmm2 669 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 670 vpand POLY(%rip), %xmm2, %xmm2 671 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly 672 ####################################################################### 673 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly 674 675 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 676 677 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 678.endm 679 680 681# Reads DLEN bytes starting at DPTR and stores in XMMDst 682# where 0 < DLEN < 16 683# Clobbers %rax, DLEN 684.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst 685 vpxor \XMMDst, \XMMDst, \XMMDst 686 687 cmp $8, \DLEN 688 jl .L_read_lt8_\@ 689 mov (\DPTR), %rax 690 vpinsrq $0, %rax, \XMMDst, \XMMDst 691 sub $8, \DLEN 692 jz .L_done_read_partial_block_\@ 693 xor %eax, %eax 694.L_read_next_byte_\@: 695 shl $8, %rax 696 mov 7(\DPTR, \DLEN, 1), %al 697 dec \DLEN 698 jnz .L_read_next_byte_\@ 699 vpinsrq $1, %rax, \XMMDst, \XMMDst 700 jmp .L_done_read_partial_block_\@ 701.L_read_lt8_\@: 702 xor %eax, %eax 703.L_read_next_byte_lt8_\@: 704 shl $8, %rax 705 mov -1(\DPTR, \DLEN, 1), %al 706 dec \DLEN 707 jnz .L_read_next_byte_lt8_\@ 708 vpinsrq $0, %rax, \XMMDst, \XMMDst 709.L_done_read_partial_block_\@: 710.endm 711 712# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks 713# between update calls. 714# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK 715# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context 716# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 717.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ 718 AAD_HASH ENC_DEC 719 mov PBlockLen(arg2), %r13 720 test %r13, %r13 721 je .L_partial_block_done_\@ # Leave Macro if no partial blocks 722 # Read in input data without over reading 723 cmp $16, \PLAIN_CYPH_LEN 724 jl .L_fewer_than_16_bytes_\@ 725 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm 726 jmp .L_data_read_\@ 727 728.L_fewer_than_16_bytes_\@: 729 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 730 mov \PLAIN_CYPH_LEN, %r12 731 READ_PARTIAL_BLOCK %r10 %r12 %xmm1 732 733 mov PBlockLen(arg2), %r13 734 735.L_data_read_\@: # Finished reading in data 736 737 vmovdqu PBlockEncKey(arg2), %xmm9 738 vmovdqu HashKey(arg2), %xmm13 739 740 lea SHIFT_MASK(%rip), %r12 741 742 # adjust the shuffle mask pointer to be able to shift r13 bytes 743 # r16-r13 is the number of bytes in plaintext mod 16) 744 add %r13, %r12 745 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask 746 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes 747 748.if \ENC_DEC == DEC 749 vmovdqa %xmm1, %xmm3 750 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) 751 752 mov \PLAIN_CYPH_LEN, %r10 753 add %r13, %r10 754 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 755 sub $16, %r10 756 # Determine if if partial block is not being filled and 757 # shift mask accordingly 758 jge .L_no_extra_mask_1_\@ 759 sub %r10, %r12 760.L_no_extra_mask_1_\@: 761 762 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 763 # get the appropriate mask to mask out bottom r13 bytes of xmm9 764 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 765 766 vpand %xmm1, %xmm3, %xmm3 767 vmovdqa SHUF_MASK(%rip), %xmm10 768 vpshufb %xmm10, %xmm3, %xmm3 769 vpshufb %xmm2, %xmm3, %xmm3 770 vpxor %xmm3, \AAD_HASH, \AAD_HASH 771 772 test %r10, %r10 773 jl .L_partial_incomplete_1_\@ 774 775 # GHASH computation for the last <16 Byte block 776 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 777 xor %eax,%eax 778 779 mov %rax, PBlockLen(arg2) 780 jmp .L_dec_done_\@ 781.L_partial_incomplete_1_\@: 782 add \PLAIN_CYPH_LEN, PBlockLen(arg2) 783.L_dec_done_\@: 784 vmovdqu \AAD_HASH, AadHash(arg2) 785.else 786 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 787 788 mov \PLAIN_CYPH_LEN, %r10 789 add %r13, %r10 790 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 791 sub $16, %r10 792 # Determine if if partial block is not being filled and 793 # shift mask accordingly 794 jge .L_no_extra_mask_2_\@ 795 sub %r10, %r12 796.L_no_extra_mask_2_\@: 797 798 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 799 # get the appropriate mask to mask out bottom r13 bytes of xmm9 800 vpand %xmm1, %xmm9, %xmm9 801 802 vmovdqa SHUF_MASK(%rip), %xmm1 803 vpshufb %xmm1, %xmm9, %xmm9 804 vpshufb %xmm2, %xmm9, %xmm9 805 vpxor %xmm9, \AAD_HASH, \AAD_HASH 806 807 test %r10, %r10 808 jl .L_partial_incomplete_2_\@ 809 810 # GHASH computation for the last <16 Byte block 811 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 812 xor %eax,%eax 813 814 mov %rax, PBlockLen(arg2) 815 jmp .L_encode_done_\@ 816.L_partial_incomplete_2_\@: 817 add \PLAIN_CYPH_LEN, PBlockLen(arg2) 818.L_encode_done_\@: 819 vmovdqu \AAD_HASH, AadHash(arg2) 820 821 vmovdqa SHUF_MASK(%rip), %xmm10 822 # shuffle xmm9 back to output as ciphertext 823 vpshufb %xmm10, %xmm9, %xmm9 824 vpshufb %xmm2, %xmm9, %xmm9 825.endif 826 # output encrypted Bytes 827 test %r10, %r10 828 jl .L_partial_fill_\@ 829 mov %r13, %r12 830 mov $16, %r13 831 # Set r13 to be the number of bytes to write out 832 sub %r12, %r13 833 jmp .L_count_set_\@ 834.L_partial_fill_\@: 835 mov \PLAIN_CYPH_LEN, %r13 836.L_count_set_\@: 837 vmovdqa %xmm9, %xmm0 838 vmovq %xmm0, %rax 839 cmp $8, %r13 840 jle .L_less_than_8_bytes_left_\@ 841 842 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 843 add $8, \DATA_OFFSET 844 psrldq $8, %xmm0 845 vmovq %xmm0, %rax 846 sub $8, %r13 847.L_less_than_8_bytes_left_\@: 848 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 849 add $1, \DATA_OFFSET 850 shr $8, %rax 851 sub $1, %r13 852 jne .L_less_than_8_bytes_left_\@ 853.L_partial_block_done_\@: 854.endm # PARTIAL_BLOCK 855 856############################################################################### 857# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 858# Input: A and B (128-bits each, bit-reflected) 859# Output: C = A*B*x mod poly, (i.e. >>1 ) 860# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 861# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 862############################################################################### 863.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 864 865 vpshufd $0b01001110, \GH, \T2 866 vpshufd $0b01001110, \HK, \T3 867 vpxor \GH , \T2, \T2 # T2 = (a1+a0) 868 vpxor \HK , \T3, \T3 # T3 = (b1+b0) 869 870 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 871 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 872 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) 873 vpxor \GH, \T2,\T2 874 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 875 876 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs 877 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs 878 vpxor \T3, \GH, \GH 879 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK 880 881 #first phase of the reduction 882 vpslld $31, \GH, \T2 # packed right shifting << 31 883 vpslld $30, \GH, \T3 # packed right shifting shift << 30 884 vpslld $25, \GH, \T4 # packed right shifting shift << 25 885 886 vpxor \T3, \T2, \T2 # xor the shifted versions 887 vpxor \T4, \T2, \T2 888 889 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW 890 891 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 892 vpxor \T2, \GH, \GH # first phase of the reduction complete 893 894 #second phase of the reduction 895 896 vpsrld $1,\GH, \T2 # packed left shifting >> 1 897 vpsrld $2,\GH, \T3 # packed left shifting >> 2 898 vpsrld $7,\GH, \T4 # packed left shifting >> 7 899 vpxor \T3, \T2, \T2 # xor the shifted versions 900 vpxor \T4, \T2, \T2 901 902 vpxor \T5, \T2, \T2 903 vpxor \T2, \GH, \GH 904 vpxor \T1, \GH, \GH # the result is in GH 905 906 907.endm 908 909.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 910 911 # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 912 vmovdqa \HK, \T5 913 914 vpshufd $0b01001110, \T5, \T1 915 vpxor \T5, \T1, \T1 916 vmovdqu \T1, HashKey_k(arg2) 917 918 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly 919 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly 920 vpshufd $0b01001110, \T5, \T1 921 vpxor \T5, \T1, \T1 922 vmovdqu \T1, HashKey_2_k(arg2) 923 924 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly 925 vmovdqu \T5, HashKey_3(arg2) 926 vpshufd $0b01001110, \T5, \T1 927 vpxor \T5, \T1, \T1 928 vmovdqu \T1, HashKey_3_k(arg2) 929 930 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly 931 vmovdqu \T5, HashKey_4(arg2) 932 vpshufd $0b01001110, \T5, \T1 933 vpxor \T5, \T1, \T1 934 vmovdqu \T1, HashKey_4_k(arg2) 935 936 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly 937 vmovdqu \T5, HashKey_5(arg2) 938 vpshufd $0b01001110, \T5, \T1 939 vpxor \T5, \T1, \T1 940 vmovdqu \T1, HashKey_5_k(arg2) 941 942 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly 943 vmovdqu \T5, HashKey_6(arg2) 944 vpshufd $0b01001110, \T5, \T1 945 vpxor \T5, \T1, \T1 946 vmovdqu \T1, HashKey_6_k(arg2) 947 948 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly 949 vmovdqu \T5, HashKey_7(arg2) 950 vpshufd $0b01001110, \T5, \T1 951 vpxor \T5, \T1, \T1 952 vmovdqu \T1, HashKey_7_k(arg2) 953 954 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly 955 vmovdqu \T5, HashKey_8(arg2) 956 vpshufd $0b01001110, \T5, \T1 957 vpxor \T5, \T1, \T1 958 vmovdqu \T1, HashKey_8_k(arg2) 959 960.endm 961 962## if a = number of total plaintext bytes 963## b = floor(a/16) 964## num_initial_blocks = b mod 4# 965## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext 966## r10, r11, r12, rax are clobbered 967## arg1, arg2, arg3, arg4 are used as pointers only, not modified 968 969.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC 970 i = (8-\num_initial_blocks) 971 setreg 972 vmovdqu AadHash(arg2), reg_i 973 974 # start AES for num_initial_blocks blocks 975 vmovdqu CurCount(arg2), \CTR 976 977 i = (9-\num_initial_blocks) 978 setreg 979.rep \num_initial_blocks 980 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 981 vmovdqa \CTR, reg_i 982 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap 983 i = (i+1) 984 setreg 985.endr 986 987 vmovdqa (arg1), \T_key 988 i = (9-\num_initial_blocks) 989 setreg 990.rep \num_initial_blocks 991 vpxor \T_key, reg_i, reg_i 992 i = (i+1) 993 setreg 994.endr 995 996 j = 1 997 setreg 998.rep \REP 999 vmovdqa 16*j(arg1), \T_key 1000 i = (9-\num_initial_blocks) 1001 setreg 1002.rep \num_initial_blocks 1003 vaesenc \T_key, reg_i, reg_i 1004 i = (i+1) 1005 setreg 1006.endr 1007 1008 j = (j+1) 1009 setreg 1010.endr 1011 1012 vmovdqa 16*j(arg1), \T_key 1013 i = (9-\num_initial_blocks) 1014 setreg 1015.rep \num_initial_blocks 1016 vaesenclast \T_key, reg_i, reg_i 1017 i = (i+1) 1018 setreg 1019.endr 1020 1021 i = (9-\num_initial_blocks) 1022 setreg 1023.rep \num_initial_blocks 1024 vmovdqu (arg4, %r11), \T1 1025 vpxor \T1, reg_i, reg_i 1026 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks 1027 add $16, %r11 1028.if \ENC_DEC == DEC 1029 vmovdqa \T1, reg_i 1030.endif 1031 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 1032 i = (i+1) 1033 setreg 1034.endr 1035 1036 1037 i = (8-\num_initial_blocks) 1038 j = (9-\num_initial_blocks) 1039 setreg 1040 1041.rep \num_initial_blocks 1042 vpxor reg_i, reg_j, reg_j 1043 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks 1044 i = (i+1) 1045 j = (j+1) 1046 setreg 1047.endr 1048 # XMM8 has the combined result here 1049 1050 vmovdqa \XMM8, TMP1(%rsp) 1051 vmovdqa \XMM8, \T3 1052 1053 cmp $128, %r13 1054 jl .L_initial_blocks_done\@ # no need for precomputed constants 1055 1056############################################################################### 1057# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 1058 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1059 vmovdqa \CTR, \XMM1 1060 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1061 1062 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1063 vmovdqa \CTR, \XMM2 1064 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1065 1066 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1067 vmovdqa \CTR, \XMM3 1068 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1069 1070 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1071 vmovdqa \CTR, \XMM4 1072 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1073 1074 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1075 vmovdqa \CTR, \XMM5 1076 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1077 1078 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1079 vmovdqa \CTR, \XMM6 1080 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1081 1082 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1083 vmovdqa \CTR, \XMM7 1084 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1085 1086 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1087 vmovdqa \CTR, \XMM8 1088 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1089 1090 vmovdqa (arg1), \T_key 1091 vpxor \T_key, \XMM1, \XMM1 1092 vpxor \T_key, \XMM2, \XMM2 1093 vpxor \T_key, \XMM3, \XMM3 1094 vpxor \T_key, \XMM4, \XMM4 1095 vpxor \T_key, \XMM5, \XMM5 1096 vpxor \T_key, \XMM6, \XMM6 1097 vpxor \T_key, \XMM7, \XMM7 1098 vpxor \T_key, \XMM8, \XMM8 1099 1100 i = 1 1101 setreg 1102.rep \REP # do REP rounds 1103 vmovdqa 16*i(arg1), \T_key 1104 vaesenc \T_key, \XMM1, \XMM1 1105 vaesenc \T_key, \XMM2, \XMM2 1106 vaesenc \T_key, \XMM3, \XMM3 1107 vaesenc \T_key, \XMM4, \XMM4 1108 vaesenc \T_key, \XMM5, \XMM5 1109 vaesenc \T_key, \XMM6, \XMM6 1110 vaesenc \T_key, \XMM7, \XMM7 1111 vaesenc \T_key, \XMM8, \XMM8 1112 i = (i+1) 1113 setreg 1114.endr 1115 1116 vmovdqa 16*i(arg1), \T_key 1117 vaesenclast \T_key, \XMM1, \XMM1 1118 vaesenclast \T_key, \XMM2, \XMM2 1119 vaesenclast \T_key, \XMM3, \XMM3 1120 vaesenclast \T_key, \XMM4, \XMM4 1121 vaesenclast \T_key, \XMM5, \XMM5 1122 vaesenclast \T_key, \XMM6, \XMM6 1123 vaesenclast \T_key, \XMM7, \XMM7 1124 vaesenclast \T_key, \XMM8, \XMM8 1125 1126 vmovdqu (arg4, %r11), \T1 1127 vpxor \T1, \XMM1, \XMM1 1128 vmovdqu \XMM1, (arg3 , %r11) 1129 .if \ENC_DEC == DEC 1130 vmovdqa \T1, \XMM1 1131 .endif 1132 1133 vmovdqu 16*1(arg4, %r11), \T1 1134 vpxor \T1, \XMM2, \XMM2 1135 vmovdqu \XMM2, 16*1(arg3 , %r11) 1136 .if \ENC_DEC == DEC 1137 vmovdqa \T1, \XMM2 1138 .endif 1139 1140 vmovdqu 16*2(arg4, %r11), \T1 1141 vpxor \T1, \XMM3, \XMM3 1142 vmovdqu \XMM3, 16*2(arg3 , %r11) 1143 .if \ENC_DEC == DEC 1144 vmovdqa \T1, \XMM3 1145 .endif 1146 1147 vmovdqu 16*3(arg4, %r11), \T1 1148 vpxor \T1, \XMM4, \XMM4 1149 vmovdqu \XMM4, 16*3(arg3 , %r11) 1150 .if \ENC_DEC == DEC 1151 vmovdqa \T1, \XMM4 1152 .endif 1153 1154 vmovdqu 16*4(arg4, %r11), \T1 1155 vpxor \T1, \XMM5, \XMM5 1156 vmovdqu \XMM5, 16*4(arg3 , %r11) 1157 .if \ENC_DEC == DEC 1158 vmovdqa \T1, \XMM5 1159 .endif 1160 1161 vmovdqu 16*5(arg4, %r11), \T1 1162 vpxor \T1, \XMM6, \XMM6 1163 vmovdqu \XMM6, 16*5(arg3 , %r11) 1164 .if \ENC_DEC == DEC 1165 vmovdqa \T1, \XMM6 1166 .endif 1167 1168 vmovdqu 16*6(arg4, %r11), \T1 1169 vpxor \T1, \XMM7, \XMM7 1170 vmovdqu \XMM7, 16*6(arg3 , %r11) 1171 .if \ENC_DEC == DEC 1172 vmovdqa \T1, \XMM7 1173 .endif 1174 1175 vmovdqu 16*7(arg4, %r11), \T1 1176 vpxor \T1, \XMM8, \XMM8 1177 vmovdqu \XMM8, 16*7(arg3 , %r11) 1178 .if \ENC_DEC == DEC 1179 vmovdqa \T1, \XMM8 1180 .endif 1181 1182 add $128, %r11 1183 1184 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1185 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext 1186 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1187 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1188 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1189 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1190 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1191 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1192 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1193 1194############################################################################### 1195 1196.L_initial_blocks_done\@: 1197 1198.endm 1199 1200# encrypt 8 blocks at a time 1201# ghash the 8 previously encrypted ciphertext blocks 1202# arg1, arg2, arg3, arg4 are used as pointers only, not modified 1203# r11 is the data offset value 1204.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC 1205 1206 vmovdqa \XMM1, \T2 1207 vmovdqa \XMM2, TMP2(%rsp) 1208 vmovdqa \XMM3, TMP3(%rsp) 1209 vmovdqa \XMM4, TMP4(%rsp) 1210 vmovdqa \XMM5, TMP5(%rsp) 1211 vmovdqa \XMM6, TMP6(%rsp) 1212 vmovdqa \XMM7, TMP7(%rsp) 1213 vmovdqa \XMM8, TMP8(%rsp) 1214 1215.if \loop_idx == in_order 1216 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT 1217 vpaddd ONE(%rip), \XMM1, \XMM2 1218 vpaddd ONE(%rip), \XMM2, \XMM3 1219 vpaddd ONE(%rip), \XMM3, \XMM4 1220 vpaddd ONE(%rip), \XMM4, \XMM5 1221 vpaddd ONE(%rip), \XMM5, \XMM6 1222 vpaddd ONE(%rip), \XMM6, \XMM7 1223 vpaddd ONE(%rip), \XMM7, \XMM8 1224 vmovdqa \XMM8, \CTR 1225 1226 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1227 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1228 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1229 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1230 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1231 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1232 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1233 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1234.else 1235 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT 1236 vpaddd ONEf(%rip), \XMM1, \XMM2 1237 vpaddd ONEf(%rip), \XMM2, \XMM3 1238 vpaddd ONEf(%rip), \XMM3, \XMM4 1239 vpaddd ONEf(%rip), \XMM4, \XMM5 1240 vpaddd ONEf(%rip), \XMM5, \XMM6 1241 vpaddd ONEf(%rip), \XMM6, \XMM7 1242 vpaddd ONEf(%rip), \XMM7, \XMM8 1243 vmovdqa \XMM8, \CTR 1244.endif 1245 1246 1247 ####################################################################### 1248 1249 vmovdqu (arg1), \T1 1250 vpxor \T1, \XMM1, \XMM1 1251 vpxor \T1, \XMM2, \XMM2 1252 vpxor \T1, \XMM3, \XMM3 1253 vpxor \T1, \XMM4, \XMM4 1254 vpxor \T1, \XMM5, \XMM5 1255 vpxor \T1, \XMM6, \XMM6 1256 vpxor \T1, \XMM7, \XMM7 1257 vpxor \T1, \XMM8, \XMM8 1258 1259 ####################################################################### 1260 1261 1262 1263 1264 1265 vmovdqu 16*1(arg1), \T1 1266 vaesenc \T1, \XMM1, \XMM1 1267 vaesenc \T1, \XMM2, \XMM2 1268 vaesenc \T1, \XMM3, \XMM3 1269 vaesenc \T1, \XMM4, \XMM4 1270 vaesenc \T1, \XMM5, \XMM5 1271 vaesenc \T1, \XMM6, \XMM6 1272 vaesenc \T1, \XMM7, \XMM7 1273 vaesenc \T1, \XMM8, \XMM8 1274 1275 vmovdqu 16*2(arg1), \T1 1276 vaesenc \T1, \XMM1, \XMM1 1277 vaesenc \T1, \XMM2, \XMM2 1278 vaesenc \T1, \XMM3, \XMM3 1279 vaesenc \T1, \XMM4, \XMM4 1280 vaesenc \T1, \XMM5, \XMM5 1281 vaesenc \T1, \XMM6, \XMM6 1282 vaesenc \T1, \XMM7, \XMM7 1283 vaesenc \T1, \XMM8, \XMM8 1284 1285 1286 ####################################################################### 1287 1288 vmovdqu HashKey_8(arg2), \T5 1289 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 1290 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 1291 1292 vpshufd $0b01001110, \T2, \T6 1293 vpxor \T2, \T6, \T6 1294 1295 vmovdqu HashKey_8_k(arg2), \T5 1296 vpclmulqdq $0x00, \T5, \T6, \T6 1297 1298 vmovdqu 16*3(arg1), \T1 1299 vaesenc \T1, \XMM1, \XMM1 1300 vaesenc \T1, \XMM2, \XMM2 1301 vaesenc \T1, \XMM3, \XMM3 1302 vaesenc \T1, \XMM4, \XMM4 1303 vaesenc \T1, \XMM5, \XMM5 1304 vaesenc \T1, \XMM6, \XMM6 1305 vaesenc \T1, \XMM7, \XMM7 1306 vaesenc \T1, \XMM8, \XMM8 1307 1308 vmovdqa TMP2(%rsp), \T1 1309 vmovdqu HashKey_7(arg2), \T5 1310 vpclmulqdq $0x11, \T5, \T1, \T3 1311 vpxor \T3, \T4, \T4 1312 vpclmulqdq $0x00, \T5, \T1, \T3 1313 vpxor \T3, \T7, \T7 1314 1315 vpshufd $0b01001110, \T1, \T3 1316 vpxor \T1, \T3, \T3 1317 vmovdqu HashKey_7_k(arg2), \T5 1318 vpclmulqdq $0x10, \T5, \T3, \T3 1319 vpxor \T3, \T6, \T6 1320 1321 vmovdqu 16*4(arg1), \T1 1322 vaesenc \T1, \XMM1, \XMM1 1323 vaesenc \T1, \XMM2, \XMM2 1324 vaesenc \T1, \XMM3, \XMM3 1325 vaesenc \T1, \XMM4, \XMM4 1326 vaesenc \T1, \XMM5, \XMM5 1327 vaesenc \T1, \XMM6, \XMM6 1328 vaesenc \T1, \XMM7, \XMM7 1329 vaesenc \T1, \XMM8, \XMM8 1330 1331 ####################################################################### 1332 1333 vmovdqa TMP3(%rsp), \T1 1334 vmovdqu HashKey_6(arg2), \T5 1335 vpclmulqdq $0x11, \T5, \T1, \T3 1336 vpxor \T3, \T4, \T4 1337 vpclmulqdq $0x00, \T5, \T1, \T3 1338 vpxor \T3, \T7, \T7 1339 1340 vpshufd $0b01001110, \T1, \T3 1341 vpxor \T1, \T3, \T3 1342 vmovdqu HashKey_6_k(arg2), \T5 1343 vpclmulqdq $0x10, \T5, \T3, \T3 1344 vpxor \T3, \T6, \T6 1345 1346 vmovdqu 16*5(arg1), \T1 1347 vaesenc \T1, \XMM1, \XMM1 1348 vaesenc \T1, \XMM2, \XMM2 1349 vaesenc \T1, \XMM3, \XMM3 1350 vaesenc \T1, \XMM4, \XMM4 1351 vaesenc \T1, \XMM5, \XMM5 1352 vaesenc \T1, \XMM6, \XMM6 1353 vaesenc \T1, \XMM7, \XMM7 1354 vaesenc \T1, \XMM8, \XMM8 1355 1356 vmovdqa TMP4(%rsp), \T1 1357 vmovdqu HashKey_5(arg2), \T5 1358 vpclmulqdq $0x11, \T5, \T1, \T3 1359 vpxor \T3, \T4, \T4 1360 vpclmulqdq $0x00, \T5, \T1, \T3 1361 vpxor \T3, \T7, \T7 1362 1363 vpshufd $0b01001110, \T1, \T3 1364 vpxor \T1, \T3, \T3 1365 vmovdqu HashKey_5_k(arg2), \T5 1366 vpclmulqdq $0x10, \T5, \T3, \T3 1367 vpxor \T3, \T6, \T6 1368 1369 vmovdqu 16*6(arg1), \T1 1370 vaesenc \T1, \XMM1, \XMM1 1371 vaesenc \T1, \XMM2, \XMM2 1372 vaesenc \T1, \XMM3, \XMM3 1373 vaesenc \T1, \XMM4, \XMM4 1374 vaesenc \T1, \XMM5, \XMM5 1375 vaesenc \T1, \XMM6, \XMM6 1376 vaesenc \T1, \XMM7, \XMM7 1377 vaesenc \T1, \XMM8, \XMM8 1378 1379 1380 vmovdqa TMP5(%rsp), \T1 1381 vmovdqu HashKey_4(arg2), \T5 1382 vpclmulqdq $0x11, \T5, \T1, \T3 1383 vpxor \T3, \T4, \T4 1384 vpclmulqdq $0x00, \T5, \T1, \T3 1385 vpxor \T3, \T7, \T7 1386 1387 vpshufd $0b01001110, \T1, \T3 1388 vpxor \T1, \T3, \T3 1389 vmovdqu HashKey_4_k(arg2), \T5 1390 vpclmulqdq $0x10, \T5, \T3, \T3 1391 vpxor \T3, \T6, \T6 1392 1393 vmovdqu 16*7(arg1), \T1 1394 vaesenc \T1, \XMM1, \XMM1 1395 vaesenc \T1, \XMM2, \XMM2 1396 vaesenc \T1, \XMM3, \XMM3 1397 vaesenc \T1, \XMM4, \XMM4 1398 vaesenc \T1, \XMM5, \XMM5 1399 vaesenc \T1, \XMM6, \XMM6 1400 vaesenc \T1, \XMM7, \XMM7 1401 vaesenc \T1, \XMM8, \XMM8 1402 1403 vmovdqa TMP6(%rsp), \T1 1404 vmovdqu HashKey_3(arg2), \T5 1405 vpclmulqdq $0x11, \T5, \T1, \T3 1406 vpxor \T3, \T4, \T4 1407 vpclmulqdq $0x00, \T5, \T1, \T3 1408 vpxor \T3, \T7, \T7 1409 1410 vpshufd $0b01001110, \T1, \T3 1411 vpxor \T1, \T3, \T3 1412 vmovdqu HashKey_3_k(arg2), \T5 1413 vpclmulqdq $0x10, \T5, \T3, \T3 1414 vpxor \T3, \T6, \T6 1415 1416 1417 vmovdqu 16*8(arg1), \T1 1418 vaesenc \T1, \XMM1, \XMM1 1419 vaesenc \T1, \XMM2, \XMM2 1420 vaesenc \T1, \XMM3, \XMM3 1421 vaesenc \T1, \XMM4, \XMM4 1422 vaesenc \T1, \XMM5, \XMM5 1423 vaesenc \T1, \XMM6, \XMM6 1424 vaesenc \T1, \XMM7, \XMM7 1425 vaesenc \T1, \XMM8, \XMM8 1426 1427 vmovdqa TMP7(%rsp), \T1 1428 vmovdqu HashKey_2(arg2), \T5 1429 vpclmulqdq $0x11, \T5, \T1, \T3 1430 vpxor \T3, \T4, \T4 1431 vpclmulqdq $0x00, \T5, \T1, \T3 1432 vpxor \T3, \T7, \T7 1433 1434 vpshufd $0b01001110, \T1, \T3 1435 vpxor \T1, \T3, \T3 1436 vmovdqu HashKey_2_k(arg2), \T5 1437 vpclmulqdq $0x10, \T5, \T3, \T3 1438 vpxor \T3, \T6, \T6 1439 1440 ####################################################################### 1441 1442 vmovdqu 16*9(arg1), \T5 1443 vaesenc \T5, \XMM1, \XMM1 1444 vaesenc \T5, \XMM2, \XMM2 1445 vaesenc \T5, \XMM3, \XMM3 1446 vaesenc \T5, \XMM4, \XMM4 1447 vaesenc \T5, \XMM5, \XMM5 1448 vaesenc \T5, \XMM6, \XMM6 1449 vaesenc \T5, \XMM7, \XMM7 1450 vaesenc \T5, \XMM8, \XMM8 1451 1452 vmovdqa TMP8(%rsp), \T1 1453 vmovdqu HashKey(arg2), \T5 1454 vpclmulqdq $0x11, \T5, \T1, \T3 1455 vpxor \T3, \T4, \T4 1456 vpclmulqdq $0x00, \T5, \T1, \T3 1457 vpxor \T3, \T7, \T7 1458 1459 vpshufd $0b01001110, \T1, \T3 1460 vpxor \T1, \T3, \T3 1461 vmovdqu HashKey_k(arg2), \T5 1462 vpclmulqdq $0x10, \T5, \T3, \T3 1463 vpxor \T3, \T6, \T6 1464 1465 vpxor \T4, \T6, \T6 1466 vpxor \T7, \T6, \T6 1467 1468 vmovdqu 16*10(arg1), \T5 1469 1470 i = 11 1471 setreg 1472.rep (\REP-9) 1473 1474 vaesenc \T5, \XMM1, \XMM1 1475 vaesenc \T5, \XMM2, \XMM2 1476 vaesenc \T5, \XMM3, \XMM3 1477 vaesenc \T5, \XMM4, \XMM4 1478 vaesenc \T5, \XMM5, \XMM5 1479 vaesenc \T5, \XMM6, \XMM6 1480 vaesenc \T5, \XMM7, \XMM7 1481 vaesenc \T5, \XMM8, \XMM8 1482 1483 vmovdqu 16*i(arg1), \T5 1484 i = i + 1 1485 setreg 1486.endr 1487 1488 i = 0 1489 j = 1 1490 setreg 1491.rep 8 1492 vpxor 16*i(arg4, %r11), \T5, \T2 1493 .if \ENC_DEC == ENC 1494 vaesenclast \T2, reg_j, reg_j 1495 .else 1496 vaesenclast \T2, reg_j, \T3 1497 vmovdqu 16*i(arg4, %r11), reg_j 1498 vmovdqu \T3, 16*i(arg3, %r11) 1499 .endif 1500 i = (i+1) 1501 j = (j+1) 1502 setreg 1503.endr 1504 ####################################################################### 1505 1506 1507 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs 1508 vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs 1509 vpxor \T3, \T7, \T7 1510 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 1511 1512 1513 1514 ####################################################################### 1515 #first phase of the reduction 1516 ####################################################################### 1517 vpslld $31, \T7, \T2 # packed right shifting << 31 1518 vpslld $30, \T7, \T3 # packed right shifting shift << 30 1519 vpslld $25, \T7, \T4 # packed right shifting shift << 25 1520 1521 vpxor \T3, \T2, \T2 # xor the shifted versions 1522 vpxor \T4, \T2, \T2 1523 1524 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 1525 1526 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 1527 vpxor \T2, \T7, \T7 # first phase of the reduction complete 1528 ####################################################################### 1529 .if \ENC_DEC == ENC 1530 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer 1531 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer 1532 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer 1533 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer 1534 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer 1535 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer 1536 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer 1537 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer 1538 .endif 1539 1540 ####################################################################### 1541 #second phase of the reduction 1542 vpsrld $1, \T7, \T2 # packed left shifting >> 1 1543 vpsrld $2, \T7, \T3 # packed left shifting >> 2 1544 vpsrld $7, \T7, \T4 # packed left shifting >> 7 1545 vpxor \T3, \T2, \T2 # xor the shifted versions 1546 vpxor \T4, \T2, \T2 1547 1548 vpxor \T1, \T2, \T2 1549 vpxor \T2, \T7, \T7 1550 vpxor \T7, \T6, \T6 # the result is in T6 1551 ####################################################################### 1552 1553 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1554 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1555 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1556 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1557 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1558 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1559 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1560 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1561 1562 1563 vpxor \T6, \XMM1, \XMM1 1564 1565 1566 1567.endm 1568 1569 1570# GHASH the last 4 ciphertext blocks. 1571.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 1572 1573 ## Karatsuba Method 1574 1575 1576 vpshufd $0b01001110, \XMM1, \T2 1577 vpxor \XMM1, \T2, \T2 1578 vmovdqu HashKey_8(arg2), \T5 1579 vpclmulqdq $0x11, \T5, \XMM1, \T6 1580 vpclmulqdq $0x00, \T5, \XMM1, \T7 1581 1582 vmovdqu HashKey_8_k(arg2), \T3 1583 vpclmulqdq $0x00, \T3, \T2, \XMM1 1584 1585 ###################### 1586 1587 vpshufd $0b01001110, \XMM2, \T2 1588 vpxor \XMM2, \T2, \T2 1589 vmovdqu HashKey_7(arg2), \T5 1590 vpclmulqdq $0x11, \T5, \XMM2, \T4 1591 vpxor \T4, \T6, \T6 1592 1593 vpclmulqdq $0x00, \T5, \XMM2, \T4 1594 vpxor \T4, \T7, \T7 1595 1596 vmovdqu HashKey_7_k(arg2), \T3 1597 vpclmulqdq $0x00, \T3, \T2, \T2 1598 vpxor \T2, \XMM1, \XMM1 1599 1600 ###################### 1601 1602 vpshufd $0b01001110, \XMM3, \T2 1603 vpxor \XMM3, \T2, \T2 1604 vmovdqu HashKey_6(arg2), \T5 1605 vpclmulqdq $0x11, \T5, \XMM3, \T4 1606 vpxor \T4, \T6, \T6 1607 1608 vpclmulqdq $0x00, \T5, \XMM3, \T4 1609 vpxor \T4, \T7, \T7 1610 1611 vmovdqu HashKey_6_k(arg2), \T3 1612 vpclmulqdq $0x00, \T3, \T2, \T2 1613 vpxor \T2, \XMM1, \XMM1 1614 1615 ###################### 1616 1617 vpshufd $0b01001110, \XMM4, \T2 1618 vpxor \XMM4, \T2, \T2 1619 vmovdqu HashKey_5(arg2), \T5 1620 vpclmulqdq $0x11, \T5, \XMM4, \T4 1621 vpxor \T4, \T6, \T6 1622 1623 vpclmulqdq $0x00, \T5, \XMM4, \T4 1624 vpxor \T4, \T7, \T7 1625 1626 vmovdqu HashKey_5_k(arg2), \T3 1627 vpclmulqdq $0x00, \T3, \T2, \T2 1628 vpxor \T2, \XMM1, \XMM1 1629 1630 ###################### 1631 1632 vpshufd $0b01001110, \XMM5, \T2 1633 vpxor \XMM5, \T2, \T2 1634 vmovdqu HashKey_4(arg2), \T5 1635 vpclmulqdq $0x11, \T5, \XMM5, \T4 1636 vpxor \T4, \T6, \T6 1637 1638 vpclmulqdq $0x00, \T5, \XMM5, \T4 1639 vpxor \T4, \T7, \T7 1640 1641 vmovdqu HashKey_4_k(arg2), \T3 1642 vpclmulqdq $0x00, \T3, \T2, \T2 1643 vpxor \T2, \XMM1, \XMM1 1644 1645 ###################### 1646 1647 vpshufd $0b01001110, \XMM6, \T2 1648 vpxor \XMM6, \T2, \T2 1649 vmovdqu HashKey_3(arg2), \T5 1650 vpclmulqdq $0x11, \T5, \XMM6, \T4 1651 vpxor \T4, \T6, \T6 1652 1653 vpclmulqdq $0x00, \T5, \XMM6, \T4 1654 vpxor \T4, \T7, \T7 1655 1656 vmovdqu HashKey_3_k(arg2), \T3 1657 vpclmulqdq $0x00, \T3, \T2, \T2 1658 vpxor \T2, \XMM1, \XMM1 1659 1660 ###################### 1661 1662 vpshufd $0b01001110, \XMM7, \T2 1663 vpxor \XMM7, \T2, \T2 1664 vmovdqu HashKey_2(arg2), \T5 1665 vpclmulqdq $0x11, \T5, \XMM7, \T4 1666 vpxor \T4, \T6, \T6 1667 1668 vpclmulqdq $0x00, \T5, \XMM7, \T4 1669 vpxor \T4, \T7, \T7 1670 1671 vmovdqu HashKey_2_k(arg2), \T3 1672 vpclmulqdq $0x00, \T3, \T2, \T2 1673 vpxor \T2, \XMM1, \XMM1 1674 1675 ###################### 1676 1677 vpshufd $0b01001110, \XMM8, \T2 1678 vpxor \XMM8, \T2, \T2 1679 vmovdqu HashKey(arg2), \T5 1680 vpclmulqdq $0x11, \T5, \XMM8, \T4 1681 vpxor \T4, \T6, \T6 1682 1683 vpclmulqdq $0x00, \T5, \XMM8, \T4 1684 vpxor \T4, \T7, \T7 1685 1686 vmovdqu HashKey_k(arg2), \T3 1687 vpclmulqdq $0x00, \T3, \T2, \T2 1688 1689 vpxor \T2, \XMM1, \XMM1 1690 vpxor \T6, \XMM1, \XMM1 1691 vpxor \T7, \XMM1, \T2 1692 1693 1694 1695 1696 vpslldq $8, \T2, \T4 1697 vpsrldq $8, \T2, \T2 1698 1699 vpxor \T4, \T7, \T7 1700 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of 1701 # the accumulated carry-less multiplications 1702 1703 ####################################################################### 1704 #first phase of the reduction 1705 vpslld $31, \T7, \T2 # packed right shifting << 31 1706 vpslld $30, \T7, \T3 # packed right shifting shift << 30 1707 vpslld $25, \T7, \T4 # packed right shifting shift << 25 1708 1709 vpxor \T3, \T2, \T2 # xor the shifted versions 1710 vpxor \T4, \T2, \T2 1711 1712 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 1713 1714 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 1715 vpxor \T2, \T7, \T7 # first phase of the reduction complete 1716 ####################################################################### 1717 1718 1719 #second phase of the reduction 1720 vpsrld $1, \T7, \T2 # packed left shifting >> 1 1721 vpsrld $2, \T7, \T3 # packed left shifting >> 2 1722 vpsrld $7, \T7, \T4 # packed left shifting >> 7 1723 vpxor \T3, \T2, \T2 # xor the shifted versions 1724 vpxor \T4, \T2, \T2 1725 1726 vpxor \T1, \T2, \T2 1727 vpxor \T2, \T7, \T7 1728 vpxor \T7, \T6, \T6 # the result is in T6 1729 1730.endm 1731 1732############################################################# 1733#void aesni_gcm_precomp_avx_gen2 1734# (gcm_data *my_ctx_data, 1735# gcm_context_data *data, 1736# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ 1737# u8 *iv, /* Pre-counter block j0: 4 byte salt 1738# (from Security Association) concatenated with 8 byte 1739# Initialisation Vector (from IPSec ESP Payload) 1740# concatenated with 0x00000001. 16-byte aligned pointer. */ 1741# const u8 *aad, /* Additional Authentication Data (AAD)*/ 1742# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 1743############################################################# 1744SYM_FUNC_START(aesni_gcm_init_avx_gen2) 1745 FUNC_SAVE 1746 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX 1747 FUNC_RESTORE 1748 RET 1749SYM_FUNC_END(aesni_gcm_init_avx_gen2) 1750 1751############################################################################### 1752#void aesni_gcm_enc_update_avx_gen2( 1753# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1754# gcm_context_data *data, 1755# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ 1756# const u8 *in, /* Plaintext input */ 1757# u64 plaintext_len) /* Length of data in Bytes for encryption. */ 1758############################################################################### 1759SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) 1760 FUNC_SAVE 1761 mov keysize, %eax 1762 cmp $32, %eax 1763 je key_256_enc_update 1764 cmp $16, %eax 1765 je key_128_enc_update 1766 # must be 192 1767 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 1768 FUNC_RESTORE 1769 RET 1770key_128_enc_update: 1771 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 1772 FUNC_RESTORE 1773 RET 1774key_256_enc_update: 1775 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 1776 FUNC_RESTORE 1777 RET 1778SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) 1779 1780############################################################################### 1781#void aesni_gcm_dec_update_avx_gen2( 1782# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1783# gcm_context_data *data, 1784# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ 1785# const u8 *in, /* Ciphertext input */ 1786# u64 plaintext_len) /* Length of data in Bytes for encryption. */ 1787############################################################################### 1788SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) 1789 FUNC_SAVE 1790 mov keysize,%eax 1791 cmp $32, %eax 1792 je key_256_dec_update 1793 cmp $16, %eax 1794 je key_128_dec_update 1795 # must be 192 1796 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 1797 FUNC_RESTORE 1798 RET 1799key_128_dec_update: 1800 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 1801 FUNC_RESTORE 1802 RET 1803key_256_dec_update: 1804 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 1805 FUNC_RESTORE 1806 RET 1807SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) 1808 1809############################################################################### 1810#void aesni_gcm_finalize_avx_gen2( 1811# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1812# gcm_context_data *data, 1813# u8 *auth_tag, /* Authenticated Tag output. */ 1814# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 1815# Valid values are 16 (most likely), 12 or 8. */ 1816############################################################################### 1817SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) 1818 FUNC_SAVE 1819 mov keysize,%eax 1820 cmp $32, %eax 1821 je key_256_finalize 1822 cmp $16, %eax 1823 je key_128_finalize 1824 # must be 192 1825 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 1826 FUNC_RESTORE 1827 RET 1828key_128_finalize: 1829 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 1830 FUNC_RESTORE 1831 RET 1832key_256_finalize: 1833 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 1834 FUNC_RESTORE 1835 RET 1836SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) 1837 1838############################################################################### 1839# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 1840# Input: A and B (128-bits each, bit-reflected) 1841# Output: C = A*B*x mod poly, (i.e. >>1 ) 1842# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 1843# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 1844############################################################################### 1845.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 1846 1847 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 1848 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 1849 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 1850 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 1851 vpxor \T3, \GH, \GH 1852 1853 1854 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs 1855 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs 1856 1857 vpxor \T3, \T1, \T1 1858 vpxor \T2, \GH, \GH 1859 1860 ####################################################################### 1861 #first phase of the reduction 1862 vmovdqa POLY2(%rip), \T3 1863 1864 vpclmulqdq $0x01, \GH, \T3, \T2 1865 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs 1866 1867 vpxor \T2, \GH, \GH # first phase of the reduction complete 1868 ####################################################################### 1869 #second phase of the reduction 1870 vpclmulqdq $0x00, \GH, \T3, \T2 1871 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 1872 1873 vpclmulqdq $0x10, \GH, \T3, \GH 1874 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) 1875 1876 vpxor \T2, \GH, \GH # second phase of the reduction complete 1877 ####################################################################### 1878 vpxor \T1, \GH, \GH # the result is in GH 1879 1880 1881.endm 1882 1883.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 1884 1885 # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 1886 vmovdqa \HK, \T5 1887 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly 1888 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly 1889 1890 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly 1891 vmovdqu \T5, HashKey_3(arg2) 1892 1893 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly 1894 vmovdqu \T5, HashKey_4(arg2) 1895 1896 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly 1897 vmovdqu \T5, HashKey_5(arg2) 1898 1899 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly 1900 vmovdqu \T5, HashKey_6(arg2) 1901 1902 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly 1903 vmovdqu \T5, HashKey_7(arg2) 1904 1905 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly 1906 vmovdqu \T5, HashKey_8(arg2) 1907 1908.endm 1909 1910## if a = number of total plaintext bytes 1911## b = floor(a/16) 1912## num_initial_blocks = b mod 4# 1913## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext 1914## r10, r11, r12, rax are clobbered 1915## arg1, arg2, arg3, arg4 are used as pointers only, not modified 1916 1917.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER 1918 i = (8-\num_initial_blocks) 1919 setreg 1920 vmovdqu AadHash(arg2), reg_i 1921 1922 # start AES for num_initial_blocks blocks 1923 vmovdqu CurCount(arg2), \CTR 1924 1925 i = (9-\num_initial_blocks) 1926 setreg 1927.rep \num_initial_blocks 1928 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1929 vmovdqa \CTR, reg_i 1930 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap 1931 i = (i+1) 1932 setreg 1933.endr 1934 1935 vmovdqa (arg1), \T_key 1936 i = (9-\num_initial_blocks) 1937 setreg 1938.rep \num_initial_blocks 1939 vpxor \T_key, reg_i, reg_i 1940 i = (i+1) 1941 setreg 1942.endr 1943 1944 j = 1 1945 setreg 1946.rep \REP 1947 vmovdqa 16*j(arg1), \T_key 1948 i = (9-\num_initial_blocks) 1949 setreg 1950.rep \num_initial_blocks 1951 vaesenc \T_key, reg_i, reg_i 1952 i = (i+1) 1953 setreg 1954.endr 1955 1956 j = (j+1) 1957 setreg 1958.endr 1959 1960 1961 vmovdqa 16*j(arg1), \T_key 1962 i = (9-\num_initial_blocks) 1963 setreg 1964.rep \num_initial_blocks 1965 vaesenclast \T_key, reg_i, reg_i 1966 i = (i+1) 1967 setreg 1968.endr 1969 1970 i = (9-\num_initial_blocks) 1971 setreg 1972.rep \num_initial_blocks 1973 vmovdqu (arg4, %r11), \T1 1974 vpxor \T1, reg_i, reg_i 1975 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for 1976 # num_initial_blocks blocks 1977 add $16, %r11 1978.if \ENC_DEC == DEC 1979 vmovdqa \T1, reg_i 1980.endif 1981 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 1982 i = (i+1) 1983 setreg 1984.endr 1985 1986 1987 i = (8-\num_initial_blocks) 1988 j = (9-\num_initial_blocks) 1989 setreg 1990 1991.rep \num_initial_blocks 1992 vpxor reg_i, reg_j, reg_j 1993 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks 1994 i = (i+1) 1995 j = (j+1) 1996 setreg 1997.endr 1998 # XMM8 has the combined result here 1999 2000 vmovdqa \XMM8, TMP1(%rsp) 2001 vmovdqa \XMM8, \T3 2002 2003 cmp $128, %r13 2004 jl .L_initial_blocks_done\@ # no need for precomputed constants 2005 2006############################################################################### 2007# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 2008 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2009 vmovdqa \CTR, \XMM1 2010 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2011 2012 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2013 vmovdqa \CTR, \XMM2 2014 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2015 2016 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2017 vmovdqa \CTR, \XMM3 2018 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2019 2020 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2021 vmovdqa \CTR, \XMM4 2022 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2023 2024 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2025 vmovdqa \CTR, \XMM5 2026 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2027 2028 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2029 vmovdqa \CTR, \XMM6 2030 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2031 2032 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2033 vmovdqa \CTR, \XMM7 2034 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2035 2036 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 2037 vmovdqa \CTR, \XMM8 2038 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2039 2040 vmovdqa (arg1), \T_key 2041 vpxor \T_key, \XMM1, \XMM1 2042 vpxor \T_key, \XMM2, \XMM2 2043 vpxor \T_key, \XMM3, \XMM3 2044 vpxor \T_key, \XMM4, \XMM4 2045 vpxor \T_key, \XMM5, \XMM5 2046 vpxor \T_key, \XMM6, \XMM6 2047 vpxor \T_key, \XMM7, \XMM7 2048 vpxor \T_key, \XMM8, \XMM8 2049 2050 i = 1 2051 setreg 2052.rep \REP # do REP rounds 2053 vmovdqa 16*i(arg1), \T_key 2054 vaesenc \T_key, \XMM1, \XMM1 2055 vaesenc \T_key, \XMM2, \XMM2 2056 vaesenc \T_key, \XMM3, \XMM3 2057 vaesenc \T_key, \XMM4, \XMM4 2058 vaesenc \T_key, \XMM5, \XMM5 2059 vaesenc \T_key, \XMM6, \XMM6 2060 vaesenc \T_key, \XMM7, \XMM7 2061 vaesenc \T_key, \XMM8, \XMM8 2062 i = (i+1) 2063 setreg 2064.endr 2065 2066 2067 vmovdqa 16*i(arg1), \T_key 2068 vaesenclast \T_key, \XMM1, \XMM1 2069 vaesenclast \T_key, \XMM2, \XMM2 2070 vaesenclast \T_key, \XMM3, \XMM3 2071 vaesenclast \T_key, \XMM4, \XMM4 2072 vaesenclast \T_key, \XMM5, \XMM5 2073 vaesenclast \T_key, \XMM6, \XMM6 2074 vaesenclast \T_key, \XMM7, \XMM7 2075 vaesenclast \T_key, \XMM8, \XMM8 2076 2077 vmovdqu (arg4, %r11), \T1 2078 vpxor \T1, \XMM1, \XMM1 2079 vmovdqu \XMM1, (arg3 , %r11) 2080 .if \ENC_DEC == DEC 2081 vmovdqa \T1, \XMM1 2082 .endif 2083 2084 vmovdqu 16*1(arg4, %r11), \T1 2085 vpxor \T1, \XMM2, \XMM2 2086 vmovdqu \XMM2, 16*1(arg3 , %r11) 2087 .if \ENC_DEC == DEC 2088 vmovdqa \T1, \XMM2 2089 .endif 2090 2091 vmovdqu 16*2(arg4, %r11), \T1 2092 vpxor \T1, \XMM3, \XMM3 2093 vmovdqu \XMM3, 16*2(arg3 , %r11) 2094 .if \ENC_DEC == DEC 2095 vmovdqa \T1, \XMM3 2096 .endif 2097 2098 vmovdqu 16*3(arg4, %r11), \T1 2099 vpxor \T1, \XMM4, \XMM4 2100 vmovdqu \XMM4, 16*3(arg3 , %r11) 2101 .if \ENC_DEC == DEC 2102 vmovdqa \T1, \XMM4 2103 .endif 2104 2105 vmovdqu 16*4(arg4, %r11), \T1 2106 vpxor \T1, \XMM5, \XMM5 2107 vmovdqu \XMM5, 16*4(arg3 , %r11) 2108 .if \ENC_DEC == DEC 2109 vmovdqa \T1, \XMM5 2110 .endif 2111 2112 vmovdqu 16*5(arg4, %r11), \T1 2113 vpxor \T1, \XMM6, \XMM6 2114 vmovdqu \XMM6, 16*5(arg3 , %r11) 2115 .if \ENC_DEC == DEC 2116 vmovdqa \T1, \XMM6 2117 .endif 2118 2119 vmovdqu 16*6(arg4, %r11), \T1 2120 vpxor \T1, \XMM7, \XMM7 2121 vmovdqu \XMM7, 16*6(arg3 , %r11) 2122 .if \ENC_DEC == DEC 2123 vmovdqa \T1, \XMM7 2124 .endif 2125 2126 vmovdqu 16*7(arg4, %r11), \T1 2127 vpxor \T1, \XMM8, \XMM8 2128 vmovdqu \XMM8, 16*7(arg3 , %r11) 2129 .if \ENC_DEC == DEC 2130 vmovdqa \T1, \XMM8 2131 .endif 2132 2133 add $128, %r11 2134 2135 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2136 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with 2137 # the corresponding ciphertext 2138 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2139 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2140 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2141 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2142 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2143 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2144 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2145 2146############################################################################### 2147 2148.L_initial_blocks_done\@: 2149 2150 2151.endm 2152 2153 2154 2155# encrypt 8 blocks at a time 2156# ghash the 8 previously encrypted ciphertext blocks 2157# arg1, arg2, arg3, arg4 are used as pointers only, not modified 2158# r11 is the data offset value 2159.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC 2160 2161 vmovdqa \XMM1, \T2 2162 vmovdqa \XMM2, TMP2(%rsp) 2163 vmovdqa \XMM3, TMP3(%rsp) 2164 vmovdqa \XMM4, TMP4(%rsp) 2165 vmovdqa \XMM5, TMP5(%rsp) 2166 vmovdqa \XMM6, TMP6(%rsp) 2167 vmovdqa \XMM7, TMP7(%rsp) 2168 vmovdqa \XMM8, TMP8(%rsp) 2169 2170.if \loop_idx == in_order 2171 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT 2172 vpaddd ONE(%rip), \XMM1, \XMM2 2173 vpaddd ONE(%rip), \XMM2, \XMM3 2174 vpaddd ONE(%rip), \XMM3, \XMM4 2175 vpaddd ONE(%rip), \XMM4, \XMM5 2176 vpaddd ONE(%rip), \XMM5, \XMM6 2177 vpaddd ONE(%rip), \XMM6, \XMM7 2178 vpaddd ONE(%rip), \XMM7, \XMM8 2179 vmovdqa \XMM8, \CTR 2180 2181 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2182 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2183 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2184 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2185 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2186 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2187 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2188 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2189.else 2190 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT 2191 vpaddd ONEf(%rip), \XMM1, \XMM2 2192 vpaddd ONEf(%rip), \XMM2, \XMM3 2193 vpaddd ONEf(%rip), \XMM3, \XMM4 2194 vpaddd ONEf(%rip), \XMM4, \XMM5 2195 vpaddd ONEf(%rip), \XMM5, \XMM6 2196 vpaddd ONEf(%rip), \XMM6, \XMM7 2197 vpaddd ONEf(%rip), \XMM7, \XMM8 2198 vmovdqa \XMM8, \CTR 2199.endif 2200 2201 2202 ####################################################################### 2203 2204 vmovdqu (arg1), \T1 2205 vpxor \T1, \XMM1, \XMM1 2206 vpxor \T1, \XMM2, \XMM2 2207 vpxor \T1, \XMM3, \XMM3 2208 vpxor \T1, \XMM4, \XMM4 2209 vpxor \T1, \XMM5, \XMM5 2210 vpxor \T1, \XMM6, \XMM6 2211 vpxor \T1, \XMM7, \XMM7 2212 vpxor \T1, \XMM8, \XMM8 2213 2214 ####################################################################### 2215 2216 2217 2218 2219 2220 vmovdqu 16*1(arg1), \T1 2221 vaesenc \T1, \XMM1, \XMM1 2222 vaesenc \T1, \XMM2, \XMM2 2223 vaesenc \T1, \XMM3, \XMM3 2224 vaesenc \T1, \XMM4, \XMM4 2225 vaesenc \T1, \XMM5, \XMM5 2226 vaesenc \T1, \XMM6, \XMM6 2227 vaesenc \T1, \XMM7, \XMM7 2228 vaesenc \T1, \XMM8, \XMM8 2229 2230 vmovdqu 16*2(arg1), \T1 2231 vaesenc \T1, \XMM1, \XMM1 2232 vaesenc \T1, \XMM2, \XMM2 2233 vaesenc \T1, \XMM3, \XMM3 2234 vaesenc \T1, \XMM4, \XMM4 2235 vaesenc \T1, \XMM5, \XMM5 2236 vaesenc \T1, \XMM6, \XMM6 2237 vaesenc \T1, \XMM7, \XMM7 2238 vaesenc \T1, \XMM8, \XMM8 2239 2240 2241 ####################################################################### 2242 2243 vmovdqu HashKey_8(arg2), \T5 2244 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 2245 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 2246 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 2247 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 2248 vpxor \T5, \T6, \T6 2249 2250 vmovdqu 16*3(arg1), \T1 2251 vaesenc \T1, \XMM1, \XMM1 2252 vaesenc \T1, \XMM2, \XMM2 2253 vaesenc \T1, \XMM3, \XMM3 2254 vaesenc \T1, \XMM4, \XMM4 2255 vaesenc \T1, \XMM5, \XMM5 2256 vaesenc \T1, \XMM6, \XMM6 2257 vaesenc \T1, \XMM7, \XMM7 2258 vaesenc \T1, \XMM8, \XMM8 2259 2260 vmovdqa TMP2(%rsp), \T1 2261 vmovdqu HashKey_7(arg2), \T5 2262 vpclmulqdq $0x11, \T5, \T1, \T3 2263 vpxor \T3, \T4, \T4 2264 2265 vpclmulqdq $0x00, \T5, \T1, \T3 2266 vpxor \T3, \T7, \T7 2267 2268 vpclmulqdq $0x01, \T5, \T1, \T3 2269 vpxor \T3, \T6, \T6 2270 2271 vpclmulqdq $0x10, \T5, \T1, \T3 2272 vpxor \T3, \T6, \T6 2273 2274 vmovdqu 16*4(arg1), \T1 2275 vaesenc \T1, \XMM1, \XMM1 2276 vaesenc \T1, \XMM2, \XMM2 2277 vaesenc \T1, \XMM3, \XMM3 2278 vaesenc \T1, \XMM4, \XMM4 2279 vaesenc \T1, \XMM5, \XMM5 2280 vaesenc \T1, \XMM6, \XMM6 2281 vaesenc \T1, \XMM7, \XMM7 2282 vaesenc \T1, \XMM8, \XMM8 2283 2284 ####################################################################### 2285 2286 vmovdqa TMP3(%rsp), \T1 2287 vmovdqu HashKey_6(arg2), \T5 2288 vpclmulqdq $0x11, \T5, \T1, \T3 2289 vpxor \T3, \T4, \T4 2290 2291 vpclmulqdq $0x00, \T5, \T1, \T3 2292 vpxor \T3, \T7, \T7 2293 2294 vpclmulqdq $0x01, \T5, \T1, \T3 2295 vpxor \T3, \T6, \T6 2296 2297 vpclmulqdq $0x10, \T5, \T1, \T3 2298 vpxor \T3, \T6, \T6 2299 2300 vmovdqu 16*5(arg1), \T1 2301 vaesenc \T1, \XMM1, \XMM1 2302 vaesenc \T1, \XMM2, \XMM2 2303 vaesenc \T1, \XMM3, \XMM3 2304 vaesenc \T1, \XMM4, \XMM4 2305 vaesenc \T1, \XMM5, \XMM5 2306 vaesenc \T1, \XMM6, \XMM6 2307 vaesenc \T1, \XMM7, \XMM7 2308 vaesenc \T1, \XMM8, \XMM8 2309 2310 vmovdqa TMP4(%rsp), \T1 2311 vmovdqu HashKey_5(arg2), \T5 2312 vpclmulqdq $0x11, \T5, \T1, \T3 2313 vpxor \T3, \T4, \T4 2314 2315 vpclmulqdq $0x00, \T5, \T1, \T3 2316 vpxor \T3, \T7, \T7 2317 2318 vpclmulqdq $0x01, \T5, \T1, \T3 2319 vpxor \T3, \T6, \T6 2320 2321 vpclmulqdq $0x10, \T5, \T1, \T3 2322 vpxor \T3, \T6, \T6 2323 2324 vmovdqu 16*6(arg1), \T1 2325 vaesenc \T1, \XMM1, \XMM1 2326 vaesenc \T1, \XMM2, \XMM2 2327 vaesenc \T1, \XMM3, \XMM3 2328 vaesenc \T1, \XMM4, \XMM4 2329 vaesenc \T1, \XMM5, \XMM5 2330 vaesenc \T1, \XMM6, \XMM6 2331 vaesenc \T1, \XMM7, \XMM7 2332 vaesenc \T1, \XMM8, \XMM8 2333 2334 2335 vmovdqa TMP5(%rsp), \T1 2336 vmovdqu HashKey_4(arg2), \T5 2337 vpclmulqdq $0x11, \T5, \T1, \T3 2338 vpxor \T3, \T4, \T4 2339 2340 vpclmulqdq $0x00, \T5, \T1, \T3 2341 vpxor \T3, \T7, \T7 2342 2343 vpclmulqdq $0x01, \T5, \T1, \T3 2344 vpxor \T3, \T6, \T6 2345 2346 vpclmulqdq $0x10, \T5, \T1, \T3 2347 vpxor \T3, \T6, \T6 2348 2349 vmovdqu 16*7(arg1), \T1 2350 vaesenc \T1, \XMM1, \XMM1 2351 vaesenc \T1, \XMM2, \XMM2 2352 vaesenc \T1, \XMM3, \XMM3 2353 vaesenc \T1, \XMM4, \XMM4 2354 vaesenc \T1, \XMM5, \XMM5 2355 vaesenc \T1, \XMM6, \XMM6 2356 vaesenc \T1, \XMM7, \XMM7 2357 vaesenc \T1, \XMM8, \XMM8 2358 2359 vmovdqa TMP6(%rsp), \T1 2360 vmovdqu HashKey_3(arg2), \T5 2361 vpclmulqdq $0x11, \T5, \T1, \T3 2362 vpxor \T3, \T4, \T4 2363 2364 vpclmulqdq $0x00, \T5, \T1, \T3 2365 vpxor \T3, \T7, \T7 2366 2367 vpclmulqdq $0x01, \T5, \T1, \T3 2368 vpxor \T3, \T6, \T6 2369 2370 vpclmulqdq $0x10, \T5, \T1, \T3 2371 vpxor \T3, \T6, \T6 2372 2373 vmovdqu 16*8(arg1), \T1 2374 vaesenc \T1, \XMM1, \XMM1 2375 vaesenc \T1, \XMM2, \XMM2 2376 vaesenc \T1, \XMM3, \XMM3 2377 vaesenc \T1, \XMM4, \XMM4 2378 vaesenc \T1, \XMM5, \XMM5 2379 vaesenc \T1, \XMM6, \XMM6 2380 vaesenc \T1, \XMM7, \XMM7 2381 vaesenc \T1, \XMM8, \XMM8 2382 2383 vmovdqa TMP7(%rsp), \T1 2384 vmovdqu HashKey_2(arg2), \T5 2385 vpclmulqdq $0x11, \T5, \T1, \T3 2386 vpxor \T3, \T4, \T4 2387 2388 vpclmulqdq $0x00, \T5, \T1, \T3 2389 vpxor \T3, \T7, \T7 2390 2391 vpclmulqdq $0x01, \T5, \T1, \T3 2392 vpxor \T3, \T6, \T6 2393 2394 vpclmulqdq $0x10, \T5, \T1, \T3 2395 vpxor \T3, \T6, \T6 2396 2397 2398 ####################################################################### 2399 2400 vmovdqu 16*9(arg1), \T5 2401 vaesenc \T5, \XMM1, \XMM1 2402 vaesenc \T5, \XMM2, \XMM2 2403 vaesenc \T5, \XMM3, \XMM3 2404 vaesenc \T5, \XMM4, \XMM4 2405 vaesenc \T5, \XMM5, \XMM5 2406 vaesenc \T5, \XMM6, \XMM6 2407 vaesenc \T5, \XMM7, \XMM7 2408 vaesenc \T5, \XMM8, \XMM8 2409 2410 vmovdqa TMP8(%rsp), \T1 2411 vmovdqu HashKey(arg2), \T5 2412 2413 vpclmulqdq $0x00, \T5, \T1, \T3 2414 vpxor \T3, \T7, \T7 2415 2416 vpclmulqdq $0x01, \T5, \T1, \T3 2417 vpxor \T3, \T6, \T6 2418 2419 vpclmulqdq $0x10, \T5, \T1, \T3 2420 vpxor \T3, \T6, \T6 2421 2422 vpclmulqdq $0x11, \T5, \T1, \T3 2423 vpxor \T3, \T4, \T1 2424 2425 2426 vmovdqu 16*10(arg1), \T5 2427 2428 i = 11 2429 setreg 2430.rep (\REP-9) 2431 vaesenc \T5, \XMM1, \XMM1 2432 vaesenc \T5, \XMM2, \XMM2 2433 vaesenc \T5, \XMM3, \XMM3 2434 vaesenc \T5, \XMM4, \XMM4 2435 vaesenc \T5, \XMM5, \XMM5 2436 vaesenc \T5, \XMM6, \XMM6 2437 vaesenc \T5, \XMM7, \XMM7 2438 vaesenc \T5, \XMM8, \XMM8 2439 2440 vmovdqu 16*i(arg1), \T5 2441 i = i + 1 2442 setreg 2443.endr 2444 2445 i = 0 2446 j = 1 2447 setreg 2448.rep 8 2449 vpxor 16*i(arg4, %r11), \T5, \T2 2450 .if \ENC_DEC == ENC 2451 vaesenclast \T2, reg_j, reg_j 2452 .else 2453 vaesenclast \T2, reg_j, \T3 2454 vmovdqu 16*i(arg4, %r11), reg_j 2455 vmovdqu \T3, 16*i(arg3, %r11) 2456 .endif 2457 i = (i+1) 2458 j = (j+1) 2459 setreg 2460.endr 2461 ####################################################################### 2462 2463 2464 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs 2465 vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs 2466 vpxor \T3, \T7, \T7 2467 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 2468 2469 2470 2471 ####################################################################### 2472 #first phase of the reduction 2473 vmovdqa POLY2(%rip), \T3 2474 2475 vpclmulqdq $0x01, \T7, \T3, \T2 2476 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs 2477 2478 vpxor \T2, \T7, \T7 # first phase of the reduction complete 2479 ####################################################################### 2480 .if \ENC_DEC == ENC 2481 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer 2482 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer 2483 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer 2484 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer 2485 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer 2486 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer 2487 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer 2488 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer 2489 .endif 2490 2491 ####################################################################### 2492 #second phase of the reduction 2493 vpclmulqdq $0x00, \T7, \T3, \T2 2494 vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2495 2496 vpclmulqdq $0x10, \T7, \T3, \T4 2497 vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) 2498 2499 vpxor \T2, \T4, \T4 # second phase of the reduction complete 2500 ####################################################################### 2501 vpxor \T4, \T1, \T1 # the result is in T1 2502 2503 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2504 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2505 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2506 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2507 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2508 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2509 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2510 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2511 2512 2513 vpxor \T1, \XMM1, \XMM1 2514 2515 2516 2517.endm 2518 2519 2520# GHASH the last 4 ciphertext blocks. 2521.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 2522 2523 ## Karatsuba Method 2524 2525 vmovdqu HashKey_8(arg2), \T5 2526 2527 vpshufd $0b01001110, \XMM1, \T2 2528 vpshufd $0b01001110, \T5, \T3 2529 vpxor \XMM1, \T2, \T2 2530 vpxor \T5, \T3, \T3 2531 2532 vpclmulqdq $0x11, \T5, \XMM1, \T6 2533 vpclmulqdq $0x00, \T5, \XMM1, \T7 2534 2535 vpclmulqdq $0x00, \T3, \T2, \XMM1 2536 2537 ###################### 2538 2539 vmovdqu HashKey_7(arg2), \T5 2540 vpshufd $0b01001110, \XMM2, \T2 2541 vpshufd $0b01001110, \T5, \T3 2542 vpxor \XMM2, \T2, \T2 2543 vpxor \T5, \T3, \T3 2544 2545 vpclmulqdq $0x11, \T5, \XMM2, \T4 2546 vpxor \T4, \T6, \T6 2547 2548 vpclmulqdq $0x00, \T5, \XMM2, \T4 2549 vpxor \T4, \T7, \T7 2550 2551 vpclmulqdq $0x00, \T3, \T2, \T2 2552 2553 vpxor \T2, \XMM1, \XMM1 2554 2555 ###################### 2556 2557 vmovdqu HashKey_6(arg2), \T5 2558 vpshufd $0b01001110, \XMM3, \T2 2559 vpshufd $0b01001110, \T5, \T3 2560 vpxor \XMM3, \T2, \T2 2561 vpxor \T5, \T3, \T3 2562 2563 vpclmulqdq $0x11, \T5, \XMM3, \T4 2564 vpxor \T4, \T6, \T6 2565 2566 vpclmulqdq $0x00, \T5, \XMM3, \T4 2567 vpxor \T4, \T7, \T7 2568 2569 vpclmulqdq $0x00, \T3, \T2, \T2 2570 2571 vpxor \T2, \XMM1, \XMM1 2572 2573 ###################### 2574 2575 vmovdqu HashKey_5(arg2), \T5 2576 vpshufd $0b01001110, \XMM4, \T2 2577 vpshufd $0b01001110, \T5, \T3 2578 vpxor \XMM4, \T2, \T2 2579 vpxor \T5, \T3, \T3 2580 2581 vpclmulqdq $0x11, \T5, \XMM4, \T4 2582 vpxor \T4, \T6, \T6 2583 2584 vpclmulqdq $0x00, \T5, \XMM4, \T4 2585 vpxor \T4, \T7, \T7 2586 2587 vpclmulqdq $0x00, \T3, \T2, \T2 2588 2589 vpxor \T2, \XMM1, \XMM1 2590 2591 ###################### 2592 2593 vmovdqu HashKey_4(arg2), \T5 2594 vpshufd $0b01001110, \XMM5, \T2 2595 vpshufd $0b01001110, \T5, \T3 2596 vpxor \XMM5, \T2, \T2 2597 vpxor \T5, \T3, \T3 2598 2599 vpclmulqdq $0x11, \T5, \XMM5, \T4 2600 vpxor \T4, \T6, \T6 2601 2602 vpclmulqdq $0x00, \T5, \XMM5, \T4 2603 vpxor \T4, \T7, \T7 2604 2605 vpclmulqdq $0x00, \T3, \T2, \T2 2606 2607 vpxor \T2, \XMM1, \XMM1 2608 2609 ###################### 2610 2611 vmovdqu HashKey_3(arg2), \T5 2612 vpshufd $0b01001110, \XMM6, \T2 2613 vpshufd $0b01001110, \T5, \T3 2614 vpxor \XMM6, \T2, \T2 2615 vpxor \T5, \T3, \T3 2616 2617 vpclmulqdq $0x11, \T5, \XMM6, \T4 2618 vpxor \T4, \T6, \T6 2619 2620 vpclmulqdq $0x00, \T5, \XMM6, \T4 2621 vpxor \T4, \T7, \T7 2622 2623 vpclmulqdq $0x00, \T3, \T2, \T2 2624 2625 vpxor \T2, \XMM1, \XMM1 2626 2627 ###################### 2628 2629 vmovdqu HashKey_2(arg2), \T5 2630 vpshufd $0b01001110, \XMM7, \T2 2631 vpshufd $0b01001110, \T5, \T3 2632 vpxor \XMM7, \T2, \T2 2633 vpxor \T5, \T3, \T3 2634 2635 vpclmulqdq $0x11, \T5, \XMM7, \T4 2636 vpxor \T4, \T6, \T6 2637 2638 vpclmulqdq $0x00, \T5, \XMM7, \T4 2639 vpxor \T4, \T7, \T7 2640 2641 vpclmulqdq $0x00, \T3, \T2, \T2 2642 2643 vpxor \T2, \XMM1, \XMM1 2644 2645 ###################### 2646 2647 vmovdqu HashKey(arg2), \T5 2648 vpshufd $0b01001110, \XMM8, \T2 2649 vpshufd $0b01001110, \T5, \T3 2650 vpxor \XMM8, \T2, \T2 2651 vpxor \T5, \T3, \T3 2652 2653 vpclmulqdq $0x11, \T5, \XMM8, \T4 2654 vpxor \T4, \T6, \T6 2655 2656 vpclmulqdq $0x00, \T5, \XMM8, \T4 2657 vpxor \T4, \T7, \T7 2658 2659 vpclmulqdq $0x00, \T3, \T2, \T2 2660 2661 vpxor \T2, \XMM1, \XMM1 2662 vpxor \T6, \XMM1, \XMM1 2663 vpxor \T7, \XMM1, \T2 2664 2665 2666 2667 2668 vpslldq $8, \T2, \T4 2669 vpsrldq $8, \T2, \T2 2670 2671 vpxor \T4, \T7, \T7 2672 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the 2673 # accumulated carry-less multiplications 2674 2675 ####################################################################### 2676 #first phase of the reduction 2677 vmovdqa POLY2(%rip), \T3 2678 2679 vpclmulqdq $0x01, \T7, \T3, \T2 2680 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs 2681 2682 vpxor \T2, \T7, \T7 # first phase of the reduction complete 2683 ####################################################################### 2684 2685 2686 #second phase of the reduction 2687 vpclmulqdq $0x00, \T7, \T3, \T2 2688 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2689 2690 vpclmulqdq $0x10, \T7, \T3, \T4 2691 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) 2692 2693 vpxor \T2, \T4, \T4 # second phase of the reduction complete 2694 ####################################################################### 2695 vpxor \T4, \T6, \T6 # the result is in T6 2696.endm 2697 2698 2699 2700############################################################# 2701#void aesni_gcm_init_avx_gen4 2702# (gcm_data *my_ctx_data, 2703# gcm_context_data *data, 2704# u8 *iv, /* Pre-counter block j0: 4 byte salt 2705# (from Security Association) concatenated with 8 byte 2706# Initialisation Vector (from IPSec ESP Payload) 2707# concatenated with 0x00000001. 16-byte aligned pointer. */ 2708# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ 2709# const u8 *aad, /* Additional Authentication Data (AAD)*/ 2710# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 2711############################################################# 2712SYM_FUNC_START(aesni_gcm_init_avx_gen4) 2713 FUNC_SAVE 2714 INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 2715 FUNC_RESTORE 2716 RET 2717SYM_FUNC_END(aesni_gcm_init_avx_gen4) 2718 2719############################################################################### 2720#void aesni_gcm_enc_avx_gen4( 2721# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2722# gcm_context_data *data, 2723# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ 2724# const u8 *in, /* Plaintext input */ 2725# u64 plaintext_len) /* Length of data in Bytes for encryption. */ 2726############################################################################### 2727SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) 2728 FUNC_SAVE 2729 mov keysize,%eax 2730 cmp $32, %eax 2731 je key_256_enc_update4 2732 cmp $16, %eax 2733 je key_128_enc_update4 2734 # must be 192 2735 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 2736 FUNC_RESTORE 2737 RET 2738key_128_enc_update4: 2739 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 2740 FUNC_RESTORE 2741 RET 2742key_256_enc_update4: 2743 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 2744 FUNC_RESTORE 2745 RET 2746SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) 2747 2748############################################################################### 2749#void aesni_gcm_dec_update_avx_gen4( 2750# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2751# gcm_context_data *data, 2752# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ 2753# const u8 *in, /* Ciphertext input */ 2754# u64 plaintext_len) /* Length of data in Bytes for encryption. */ 2755############################################################################### 2756SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) 2757 FUNC_SAVE 2758 mov keysize,%eax 2759 cmp $32, %eax 2760 je key_256_dec_update4 2761 cmp $16, %eax 2762 je key_128_dec_update4 2763 # must be 192 2764 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 2765 FUNC_RESTORE 2766 RET 2767key_128_dec_update4: 2768 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 2769 FUNC_RESTORE 2770 RET 2771key_256_dec_update4: 2772 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 2773 FUNC_RESTORE 2774 RET 2775SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) 2776 2777############################################################################### 2778#void aesni_gcm_finalize_avx_gen4( 2779# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2780# gcm_context_data *data, 2781# u8 *auth_tag, /* Authenticated Tag output. */ 2782# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 2783# Valid values are 16 (most likely), 12 or 8. */ 2784############################################################################### 2785SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) 2786 FUNC_SAVE 2787 mov keysize,%eax 2788 cmp $32, %eax 2789 je key_256_finalize4 2790 cmp $16, %eax 2791 je key_128_finalize4 2792 # must be 192 2793 GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 2794 FUNC_RESTORE 2795 RET 2796key_128_finalize4: 2797 GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 2798 FUNC_RESTORE 2799 RET 2800key_256_finalize4: 2801 GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 2802 FUNC_RESTORE 2803 RET 2804SYM_FUNC_END(aesni_gcm_finalize_avx_gen4) 2805