1/* 2 * Implement AES algorithm in Intel AES-NI instructions. 3 * 4 * The white paper of AES-NI instructions can be downloaded from: 5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf 6 * 7 * Copyright (C) 2008, Intel Corp. 8 * Author: Huang Ying <ying.huang@intel.com> 9 * Vinodh Gopal <vinodh.gopal@intel.com> 10 * Kahraman Akdemir 11 * 12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD 13 * interface for 64-bit kernels. 14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) 15 * Aidan O'Mahony (aidan.o.mahony@intel.com) 16 * Adrian Hoban <adrian.hoban@intel.com> 17 * James Guilford (james.guilford@intel.com) 18 * Gabriele Paoloni <gabriele.paoloni@intel.com> 19 * Tadeusz Struk (tadeusz.struk@intel.com) 20 * Wajdi Feghali (wajdi.k.feghali@intel.com) 21 * Copyright (c) 2010, Intel Corporation. 22 * 23 * Ported x86_64 version to x86: 24 * Author: Mathias Krause <minipli@googlemail.com> 25 * 26 * This program is free software; you can redistribute it and/or modify 27 * it under the terms of the GNU General Public License as published by 28 * the Free Software Foundation; either version 2 of the License, or 29 * (at your option) any later version. 30 */ 31 32#include <linux/linkage.h> 33#include <asm/inst.h> 34#include <asm/frame.h> 35#include <asm/nospec-branch.h> 36 37/* 38 * The following macros are used to move an (un)aligned 16 byte value to/from 39 * an XMM register. This can done for either FP or integer values, for FP use 40 * movaps (move aligned packed single) or integer use movdqa (move double quad 41 * aligned). It doesn't make a performance difference which instruction is used 42 * since Nehalem (original Core i7) was released. However, the movaps is a byte 43 * shorter, so that is the one we'll use for now. (same for unaligned). 44 */ 45#define MOVADQ movaps 46#define MOVUDQ movups 47 48#ifdef __x86_64__ 49 50# constants in mergeable sections, linker can reorder and merge 51.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 52.align 16 53.Lgf128mul_x_ble_mask: 54 .octa 0x00000000000000010000000000000087 55.section .rodata.cst16.POLY, "aM", @progbits, 16 56.align 16 57POLY: .octa 0xC2000000000000000000000000000001 58.section .rodata.cst16.TWOONE, "aM", @progbits, 16 59.align 16 60TWOONE: .octa 0x00000001000000000000000000000001 61 62.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 63.align 16 64SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F 65.section .rodata.cst16.MASK1, "aM", @progbits, 16 66.align 16 67MASK1: .octa 0x0000000000000000ffffffffffffffff 68.section .rodata.cst16.MASK2, "aM", @progbits, 16 69.align 16 70MASK2: .octa 0xffffffffffffffff0000000000000000 71.section .rodata.cst16.ONE, "aM", @progbits, 16 72.align 16 73ONE: .octa 0x00000000000000000000000000000001 74.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 75.align 16 76F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 77.section .rodata.cst16.dec, "aM", @progbits, 16 78.align 16 79dec: .octa 0x1 80.section .rodata.cst16.enc, "aM", @progbits, 16 81.align 16 82enc: .octa 0x2 83 84# order of these constants should not change. 85# more specifically, ALL_F should follow SHIFT_MASK, 86# and zero should follow ALL_F 87.section .rodata, "a", @progbits 88.align 16 89SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 90ALL_F: .octa 0xffffffffffffffffffffffffffffffff 91 .octa 0x00000000000000000000000000000000 92 93.text 94 95 96#define STACK_OFFSET 8*3 97#define HashKey 16*0 // store HashKey <<1 mod poly here 98#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here 99#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here 100#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here 101#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64 102 // bits of HashKey <<1 mod poly here 103 //(for Karatsuba purposes) 104#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64 105 // bits of HashKey^2 <<1 mod poly here 106 // (for Karatsuba purposes) 107#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64 108 // bits of HashKey^3 <<1 mod poly here 109 // (for Karatsuba purposes) 110#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64 111 // bits of HashKey^4 <<1 mod poly here 112 // (for Karatsuba purposes) 113#define VARIABLE_OFFSET 16*8 114 115#define arg1 rdi 116#define arg2 rsi 117#define arg3 rdx 118#define arg4 rcx 119#define arg5 r8 120#define arg6 r9 121#define arg7 STACK_OFFSET+8(%r14) 122#define arg8 STACK_OFFSET+16(%r14) 123#define arg9 STACK_OFFSET+24(%r14) 124#define arg10 STACK_OFFSET+32(%r14) 125#define keysize 2*15*16(%arg1) 126#endif 127 128 129#define STATE1 %xmm0 130#define STATE2 %xmm4 131#define STATE3 %xmm5 132#define STATE4 %xmm6 133#define STATE STATE1 134#define IN1 %xmm1 135#define IN2 %xmm7 136#define IN3 %xmm8 137#define IN4 %xmm9 138#define IN IN1 139#define KEY %xmm2 140#define IV %xmm3 141 142#define BSWAP_MASK %xmm10 143#define CTR %xmm11 144#define INC %xmm12 145 146#define GF128MUL_MASK %xmm10 147 148#ifdef __x86_64__ 149#define AREG %rax 150#define KEYP %rdi 151#define OUTP %rsi 152#define UKEYP OUTP 153#define INP %rdx 154#define LEN %rcx 155#define IVP %r8 156#define KLEN %r9d 157#define T1 %r10 158#define TKEYP T1 159#define T2 %r11 160#define TCTR_LOW T2 161#else 162#define AREG %eax 163#define KEYP %edi 164#define OUTP AREG 165#define UKEYP OUTP 166#define INP %edx 167#define LEN %esi 168#define IVP %ebp 169#define KLEN %ebx 170#define T1 %ecx 171#define TKEYP T1 172#endif 173 174 175#ifdef __x86_64__ 176/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 177* 178* 179* Input: A and B (128-bits each, bit-reflected) 180* Output: C = A*B*x mod poly, (i.e. >>1 ) 181* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 182* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 183* 184*/ 185.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 186 movdqa \GH, \TMP1 187 pshufd $78, \GH, \TMP2 188 pshufd $78, \HK, \TMP3 189 pxor \GH, \TMP2 # TMP2 = a1+a0 190 pxor \HK, \TMP3 # TMP3 = b1+b0 191 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 192 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 193 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) 194 pxor \GH, \TMP2 195 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) 196 movdqa \TMP2, \TMP3 197 pslldq $8, \TMP3 # left shift TMP3 2 DWs 198 psrldq $8, \TMP2 # right shift TMP2 2 DWs 199 pxor \TMP3, \GH 200 pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK 201 202 # first phase of the reduction 203 204 movdqa \GH, \TMP2 205 movdqa \GH, \TMP3 206 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 207 # in in order to perform 208 # independent shifts 209 pslld $31, \TMP2 # packed right shift <<31 210 pslld $30, \TMP3 # packed right shift <<30 211 pslld $25, \TMP4 # packed right shift <<25 212 pxor \TMP3, \TMP2 # xor the shifted versions 213 pxor \TMP4, \TMP2 214 movdqa \TMP2, \TMP5 215 psrldq $4, \TMP5 # right shift TMP5 1 DW 216 pslldq $12, \TMP2 # left shift TMP2 3 DWs 217 pxor \TMP2, \GH 218 219 # second phase of the reduction 220 221 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 222 # in in order to perform 223 # independent shifts 224 movdqa \GH,\TMP3 225 movdqa \GH,\TMP4 226 psrld $1,\TMP2 # packed left shift >>1 227 psrld $2,\TMP3 # packed left shift >>2 228 psrld $7,\TMP4 # packed left shift >>7 229 pxor \TMP3,\TMP2 # xor the shifted versions 230 pxor \TMP4,\TMP2 231 pxor \TMP5, \TMP2 232 pxor \TMP2, \GH 233 pxor \TMP1, \GH # result is in TMP1 234.endm 235 236# Reads DLEN bytes starting at DPTR and stores in XMMDst 237# where 0 < DLEN < 16 238# Clobbers %rax, DLEN and XMM1 239.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst 240 cmp $8, \DLEN 241 jl _read_lt8_\@ 242 mov (\DPTR), %rax 243 MOVQ_R64_XMM %rax, \XMMDst 244 sub $8, \DLEN 245 jz _done_read_partial_block_\@ 246 xor %eax, %eax 247_read_next_byte_\@: 248 shl $8, %rax 249 mov 7(\DPTR, \DLEN, 1), %al 250 dec \DLEN 251 jnz _read_next_byte_\@ 252 MOVQ_R64_XMM %rax, \XMM1 253 pslldq $8, \XMM1 254 por \XMM1, \XMMDst 255 jmp _done_read_partial_block_\@ 256_read_lt8_\@: 257 xor %eax, %eax 258_read_next_byte_lt8_\@: 259 shl $8, %rax 260 mov -1(\DPTR, \DLEN, 1), %al 261 dec \DLEN 262 jnz _read_next_byte_lt8_\@ 263 MOVQ_R64_XMM %rax, \XMMDst 264_done_read_partial_block_\@: 265.endm 266 267/* 268* if a = number of total plaintext bytes 269* b = floor(a/16) 270* num_initial_blocks = b mod 4 271* encrypt the initial num_initial_blocks blocks and apply ghash on 272* the ciphertext 273* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 274* are clobbered 275* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified 276*/ 277 278 279.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 280XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 281 MOVADQ SHUF_MASK(%rip), %xmm14 282 mov arg7, %r10 # %r10 = AAD 283 mov arg8, %r11 # %r11 = aadLen 284 pxor %xmm\i, %xmm\i 285 pxor \XMM2, \XMM2 286 287 cmp $16, %r11 288 jl _get_AAD_rest\num_initial_blocks\operation 289_get_AAD_blocks\num_initial_blocks\operation: 290 movdqu (%r10), %xmm\i 291 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 292 pxor %xmm\i, \XMM2 293 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 294 add $16, %r10 295 sub $16, %r11 296 cmp $16, %r11 297 jge _get_AAD_blocks\num_initial_blocks\operation 298 299 movdqu \XMM2, %xmm\i 300 301 /* read the last <16B of AAD */ 302_get_AAD_rest\num_initial_blocks\operation: 303 cmp $0, %r11 304 je _get_AAD_done\num_initial_blocks\operation 305 306 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i 307 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 308 pxor \XMM2, %xmm\i 309 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 310 311_get_AAD_done\num_initial_blocks\operation: 312 xor %r11, %r11 # initialise the data pointer offset as zero 313 # start AES for num_initial_blocks blocks 314 315 mov %arg5, %rax # %rax = *Y0 316 movdqu (%rax), \XMM0 # XMM0 = Y0 317 PSHUFB_XMM %xmm14, \XMM0 318 319.if (\i == 5) || (\i == 6) || (\i == 7) 320 MOVADQ ONE(%RIP),\TMP1 321 MOVADQ (%arg1),\TMP2 322.irpc index, \i_seq 323 paddd \TMP1, \XMM0 # INCR Y0 324 movdqa \XMM0, %xmm\index 325 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 326 pxor \TMP2, %xmm\index 327.endr 328 lea 0x10(%arg1),%r10 329 mov keysize,%eax 330 shr $2,%eax # 128->4, 192->6, 256->8 331 add $5,%eax # 128->9, 192->11, 256->13 332 333aes_loop_initial_dec\num_initial_blocks: 334 MOVADQ (%r10),\TMP1 335.irpc index, \i_seq 336 AESENC \TMP1, %xmm\index 337.endr 338 add $16,%r10 339 sub $1,%eax 340 jnz aes_loop_initial_dec\num_initial_blocks 341 342 MOVADQ (%r10), \TMP1 343.irpc index, \i_seq 344 AESENCLAST \TMP1, %xmm\index # Last Round 345.endr 346.irpc index, \i_seq 347 movdqu (%arg3 , %r11, 1), \TMP1 348 pxor \TMP1, %xmm\index 349 movdqu %xmm\index, (%arg2 , %r11, 1) 350 # write back plaintext/ciphertext for num_initial_blocks 351 add $16, %r11 352 353 movdqa \TMP1, %xmm\index 354 PSHUFB_XMM %xmm14, %xmm\index 355 # prepare plaintext/ciphertext for GHASH computation 356.endr 357.endif 358 359 # apply GHASH on num_initial_blocks blocks 360 361.if \i == 5 362 pxor %xmm5, %xmm6 363 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 364 pxor %xmm6, %xmm7 365 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 366 pxor %xmm7, %xmm8 367 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 368.elseif \i == 6 369 pxor %xmm6, %xmm7 370 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 371 pxor %xmm7, %xmm8 372 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 373.elseif \i == 7 374 pxor %xmm7, %xmm8 375 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 376.endif 377 cmp $64, %r13 378 jl _initial_blocks_done\num_initial_blocks\operation 379 # no need for precomputed values 380/* 381* 382* Precomputations for HashKey parallel with encryption of first 4 blocks. 383* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 384*/ 385 MOVADQ ONE(%rip), \TMP1 386 paddd \TMP1, \XMM0 # INCR Y0 387 MOVADQ \XMM0, \XMM1 388 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 389 390 paddd \TMP1, \XMM0 # INCR Y0 391 MOVADQ \XMM0, \XMM2 392 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 393 394 paddd \TMP1, \XMM0 # INCR Y0 395 MOVADQ \XMM0, \XMM3 396 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 397 398 paddd \TMP1, \XMM0 # INCR Y0 399 MOVADQ \XMM0, \XMM4 400 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 401 402 MOVADQ 0(%arg1),\TMP1 403 pxor \TMP1, \XMM1 404 pxor \TMP1, \XMM2 405 pxor \TMP1, \XMM3 406 pxor \TMP1, \XMM4 407 movdqa \TMP3, \TMP5 408 pshufd $78, \TMP3, \TMP1 409 pxor \TMP3, \TMP1 410 movdqa \TMP1, HashKey_k(%rsp) 411 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 412# TMP5 = HashKey^2<<1 (mod poly) 413 movdqa \TMP5, HashKey_2(%rsp) 414# HashKey_2 = HashKey^2<<1 (mod poly) 415 pshufd $78, \TMP5, \TMP1 416 pxor \TMP5, \TMP1 417 movdqa \TMP1, HashKey_2_k(%rsp) 418.irpc index, 1234 # do 4 rounds 419 movaps 0x10*\index(%arg1), \TMP1 420 AESENC \TMP1, \XMM1 421 AESENC \TMP1, \XMM2 422 AESENC \TMP1, \XMM3 423 AESENC \TMP1, \XMM4 424.endr 425 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 426# TMP5 = HashKey^3<<1 (mod poly) 427 movdqa \TMP5, HashKey_3(%rsp) 428 pshufd $78, \TMP5, \TMP1 429 pxor \TMP5, \TMP1 430 movdqa \TMP1, HashKey_3_k(%rsp) 431.irpc index, 56789 # do next 5 rounds 432 movaps 0x10*\index(%arg1), \TMP1 433 AESENC \TMP1, \XMM1 434 AESENC \TMP1, \XMM2 435 AESENC \TMP1, \XMM3 436 AESENC \TMP1, \XMM4 437.endr 438 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 439# TMP5 = HashKey^3<<1 (mod poly) 440 movdqa \TMP5, HashKey_4(%rsp) 441 pshufd $78, \TMP5, \TMP1 442 pxor \TMP5, \TMP1 443 movdqa \TMP1, HashKey_4_k(%rsp) 444 lea 0xa0(%arg1),%r10 445 mov keysize,%eax 446 shr $2,%eax # 128->4, 192->6, 256->8 447 sub $4,%eax # 128->0, 192->2, 256->4 448 jz aes_loop_pre_dec_done\num_initial_blocks 449 450aes_loop_pre_dec\num_initial_blocks: 451 MOVADQ (%r10),\TMP2 452.irpc index, 1234 453 AESENC \TMP2, %xmm\index 454.endr 455 add $16,%r10 456 sub $1,%eax 457 jnz aes_loop_pre_dec\num_initial_blocks 458 459aes_loop_pre_dec_done\num_initial_blocks: 460 MOVADQ (%r10), \TMP2 461 AESENCLAST \TMP2, \XMM1 462 AESENCLAST \TMP2, \XMM2 463 AESENCLAST \TMP2, \XMM3 464 AESENCLAST \TMP2, \XMM4 465 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 466 pxor \TMP1, \XMM1 467 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 468 movdqa \TMP1, \XMM1 469 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 470 pxor \TMP1, \XMM2 471 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 472 movdqa \TMP1, \XMM2 473 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 474 pxor \TMP1, \XMM3 475 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 476 movdqa \TMP1, \XMM3 477 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 478 pxor \TMP1, \XMM4 479 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 480 movdqa \TMP1, \XMM4 481 add $64, %r11 482 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 483 pxor \XMMDst, \XMM1 484# combine GHASHed value with the corresponding ciphertext 485 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 486 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 487 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 488 489_initial_blocks_done\num_initial_blocks\operation: 490 491.endm 492 493 494/* 495* if a = number of total plaintext bytes 496* b = floor(a/16) 497* num_initial_blocks = b mod 4 498* encrypt the initial num_initial_blocks blocks and apply ghash on 499* the ciphertext 500* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 501* are clobbered 502* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified 503*/ 504 505 506.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 507XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 508 MOVADQ SHUF_MASK(%rip), %xmm14 509 mov arg7, %r10 # %r10 = AAD 510 mov arg8, %r11 # %r11 = aadLen 511 pxor %xmm\i, %xmm\i 512 pxor \XMM2, \XMM2 513 514 cmp $16, %r11 515 jl _get_AAD_rest\num_initial_blocks\operation 516_get_AAD_blocks\num_initial_blocks\operation: 517 movdqu (%r10), %xmm\i 518 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 519 pxor %xmm\i, \XMM2 520 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 521 add $16, %r10 522 sub $16, %r11 523 cmp $16, %r11 524 jge _get_AAD_blocks\num_initial_blocks\operation 525 526 movdqu \XMM2, %xmm\i 527 528 /* read the last <16B of AAD */ 529_get_AAD_rest\num_initial_blocks\operation: 530 cmp $0, %r11 531 je _get_AAD_done\num_initial_blocks\operation 532 533 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i 534 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 535 pxor \XMM2, %xmm\i 536 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 537 538_get_AAD_done\num_initial_blocks\operation: 539 xor %r11, %r11 # initialise the data pointer offset as zero 540 # start AES for num_initial_blocks blocks 541 542 mov %arg5, %rax # %rax = *Y0 543 movdqu (%rax), \XMM0 # XMM0 = Y0 544 PSHUFB_XMM %xmm14, \XMM0 545 546.if (\i == 5) || (\i == 6) || (\i == 7) 547 548 MOVADQ ONE(%RIP),\TMP1 549 MOVADQ 0(%arg1),\TMP2 550.irpc index, \i_seq 551 paddd \TMP1, \XMM0 # INCR Y0 552 MOVADQ \XMM0, %xmm\index 553 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 554 pxor \TMP2, %xmm\index 555.endr 556 lea 0x10(%arg1),%r10 557 mov keysize,%eax 558 shr $2,%eax # 128->4, 192->6, 256->8 559 add $5,%eax # 128->9, 192->11, 256->13 560 561aes_loop_initial_enc\num_initial_blocks: 562 MOVADQ (%r10),\TMP1 563.irpc index, \i_seq 564 AESENC \TMP1, %xmm\index 565.endr 566 add $16,%r10 567 sub $1,%eax 568 jnz aes_loop_initial_enc\num_initial_blocks 569 570 MOVADQ (%r10), \TMP1 571.irpc index, \i_seq 572 AESENCLAST \TMP1, %xmm\index # Last Round 573.endr 574.irpc index, \i_seq 575 movdqu (%arg3 , %r11, 1), \TMP1 576 pxor \TMP1, %xmm\index 577 movdqu %xmm\index, (%arg2 , %r11, 1) 578 # write back plaintext/ciphertext for num_initial_blocks 579 add $16, %r11 580 PSHUFB_XMM %xmm14, %xmm\index 581 582 # prepare plaintext/ciphertext for GHASH computation 583.endr 584.endif 585 586 # apply GHASH on num_initial_blocks blocks 587 588.if \i == 5 589 pxor %xmm5, %xmm6 590 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 591 pxor %xmm6, %xmm7 592 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 593 pxor %xmm7, %xmm8 594 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 595.elseif \i == 6 596 pxor %xmm6, %xmm7 597 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 598 pxor %xmm7, %xmm8 599 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 600.elseif \i == 7 601 pxor %xmm7, %xmm8 602 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 603.endif 604 cmp $64, %r13 605 jl _initial_blocks_done\num_initial_blocks\operation 606 # no need for precomputed values 607/* 608* 609* Precomputations for HashKey parallel with encryption of first 4 blocks. 610* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 611*/ 612 MOVADQ ONE(%RIP),\TMP1 613 paddd \TMP1, \XMM0 # INCR Y0 614 MOVADQ \XMM0, \XMM1 615 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 616 617 paddd \TMP1, \XMM0 # INCR Y0 618 MOVADQ \XMM0, \XMM2 619 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 620 621 paddd \TMP1, \XMM0 # INCR Y0 622 MOVADQ \XMM0, \XMM3 623 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 624 625 paddd \TMP1, \XMM0 # INCR Y0 626 MOVADQ \XMM0, \XMM4 627 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 628 629 MOVADQ 0(%arg1),\TMP1 630 pxor \TMP1, \XMM1 631 pxor \TMP1, \XMM2 632 pxor \TMP1, \XMM3 633 pxor \TMP1, \XMM4 634 movdqa \TMP3, \TMP5 635 pshufd $78, \TMP3, \TMP1 636 pxor \TMP3, \TMP1 637 movdqa \TMP1, HashKey_k(%rsp) 638 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 639# TMP5 = HashKey^2<<1 (mod poly) 640 movdqa \TMP5, HashKey_2(%rsp) 641# HashKey_2 = HashKey^2<<1 (mod poly) 642 pshufd $78, \TMP5, \TMP1 643 pxor \TMP5, \TMP1 644 movdqa \TMP1, HashKey_2_k(%rsp) 645.irpc index, 1234 # do 4 rounds 646 movaps 0x10*\index(%arg1), \TMP1 647 AESENC \TMP1, \XMM1 648 AESENC \TMP1, \XMM2 649 AESENC \TMP1, \XMM3 650 AESENC \TMP1, \XMM4 651.endr 652 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 653# TMP5 = HashKey^3<<1 (mod poly) 654 movdqa \TMP5, HashKey_3(%rsp) 655 pshufd $78, \TMP5, \TMP1 656 pxor \TMP5, \TMP1 657 movdqa \TMP1, HashKey_3_k(%rsp) 658.irpc index, 56789 # do next 5 rounds 659 movaps 0x10*\index(%arg1), \TMP1 660 AESENC \TMP1, \XMM1 661 AESENC \TMP1, \XMM2 662 AESENC \TMP1, \XMM3 663 AESENC \TMP1, \XMM4 664.endr 665 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 666# TMP5 = HashKey^3<<1 (mod poly) 667 movdqa \TMP5, HashKey_4(%rsp) 668 pshufd $78, \TMP5, \TMP1 669 pxor \TMP5, \TMP1 670 movdqa \TMP1, HashKey_4_k(%rsp) 671 lea 0xa0(%arg1),%r10 672 mov keysize,%eax 673 shr $2,%eax # 128->4, 192->6, 256->8 674 sub $4,%eax # 128->0, 192->2, 256->4 675 jz aes_loop_pre_enc_done\num_initial_blocks 676 677aes_loop_pre_enc\num_initial_blocks: 678 MOVADQ (%r10),\TMP2 679.irpc index, 1234 680 AESENC \TMP2, %xmm\index 681.endr 682 add $16,%r10 683 sub $1,%eax 684 jnz aes_loop_pre_enc\num_initial_blocks 685 686aes_loop_pre_enc_done\num_initial_blocks: 687 MOVADQ (%r10), \TMP2 688 AESENCLAST \TMP2, \XMM1 689 AESENCLAST \TMP2, \XMM2 690 AESENCLAST \TMP2, \XMM3 691 AESENCLAST \TMP2, \XMM4 692 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 693 pxor \TMP1, \XMM1 694 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 695 pxor \TMP1, \XMM2 696 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 697 pxor \TMP1, \XMM3 698 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 699 pxor \TMP1, \XMM4 700 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 701 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 702 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 703 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 704 705 add $64, %r11 706 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 707 pxor \XMMDst, \XMM1 708# combine GHASHed value with the corresponding ciphertext 709 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 710 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 711 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 712 713_initial_blocks_done\num_initial_blocks\operation: 714 715.endm 716 717/* 718* encrypt 4 blocks at a time 719* ghash the 4 previously encrypted ciphertext blocks 720* arg1, %arg2, %arg3 are used as pointers only, not modified 721* %r11 is the data offset value 722*/ 723.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ 724TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 725 726 movdqa \XMM1, \XMM5 727 movdqa \XMM2, \XMM6 728 movdqa \XMM3, \XMM7 729 movdqa \XMM4, \XMM8 730 731 movdqa SHUF_MASK(%rip), %xmm15 732 # multiply TMP5 * HashKey using karatsuba 733 734 movdqa \XMM5, \TMP4 735 pshufd $78, \XMM5, \TMP6 736 pxor \XMM5, \TMP6 737 paddd ONE(%rip), \XMM0 # INCR CNT 738 movdqa HashKey_4(%rsp), \TMP5 739 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 740 movdqa \XMM0, \XMM1 741 paddd ONE(%rip), \XMM0 # INCR CNT 742 movdqa \XMM0, \XMM2 743 paddd ONE(%rip), \XMM0 # INCR CNT 744 movdqa \XMM0, \XMM3 745 paddd ONE(%rip), \XMM0 # INCR CNT 746 movdqa \XMM0, \XMM4 747 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 748 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 749 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 750 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 751 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 752 753 pxor (%arg1), \XMM1 754 pxor (%arg1), \XMM2 755 pxor (%arg1), \XMM3 756 pxor (%arg1), \XMM4 757 movdqa HashKey_4_k(%rsp), \TMP5 758 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 759 movaps 0x10(%arg1), \TMP1 760 AESENC \TMP1, \XMM1 # Round 1 761 AESENC \TMP1, \XMM2 762 AESENC \TMP1, \XMM3 763 AESENC \TMP1, \XMM4 764 movaps 0x20(%arg1), \TMP1 765 AESENC \TMP1, \XMM1 # Round 2 766 AESENC \TMP1, \XMM2 767 AESENC \TMP1, \XMM3 768 AESENC \TMP1, \XMM4 769 movdqa \XMM6, \TMP1 770 pshufd $78, \XMM6, \TMP2 771 pxor \XMM6, \TMP2 772 movdqa HashKey_3(%rsp), \TMP5 773 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 774 movaps 0x30(%arg1), \TMP3 775 AESENC \TMP3, \XMM1 # Round 3 776 AESENC \TMP3, \XMM2 777 AESENC \TMP3, \XMM3 778 AESENC \TMP3, \XMM4 779 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 780 movaps 0x40(%arg1), \TMP3 781 AESENC \TMP3, \XMM1 # Round 4 782 AESENC \TMP3, \XMM2 783 AESENC \TMP3, \XMM3 784 AESENC \TMP3, \XMM4 785 movdqa HashKey_3_k(%rsp), \TMP5 786 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 787 movaps 0x50(%arg1), \TMP3 788 AESENC \TMP3, \XMM1 # Round 5 789 AESENC \TMP3, \XMM2 790 AESENC \TMP3, \XMM3 791 AESENC \TMP3, \XMM4 792 pxor \TMP1, \TMP4 793# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 794 pxor \XMM6, \XMM5 795 pxor \TMP2, \TMP6 796 movdqa \XMM7, \TMP1 797 pshufd $78, \XMM7, \TMP2 798 pxor \XMM7, \TMP2 799 movdqa HashKey_2(%rsp ), \TMP5 800 801 # Multiply TMP5 * HashKey using karatsuba 802 803 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 804 movaps 0x60(%arg1), \TMP3 805 AESENC \TMP3, \XMM1 # Round 6 806 AESENC \TMP3, \XMM2 807 AESENC \TMP3, \XMM3 808 AESENC \TMP3, \XMM4 809 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 810 movaps 0x70(%arg1), \TMP3 811 AESENC \TMP3, \XMM1 # Round 7 812 AESENC \TMP3, \XMM2 813 AESENC \TMP3, \XMM3 814 AESENC \TMP3, \XMM4 815 movdqa HashKey_2_k(%rsp), \TMP5 816 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 817 movaps 0x80(%arg1), \TMP3 818 AESENC \TMP3, \XMM1 # Round 8 819 AESENC \TMP3, \XMM2 820 AESENC \TMP3, \XMM3 821 AESENC \TMP3, \XMM4 822 pxor \TMP1, \TMP4 823# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 824 pxor \XMM7, \XMM5 825 pxor \TMP2, \TMP6 826 827 # Multiply XMM8 * HashKey 828 # XMM8 and TMP5 hold the values for the two operands 829 830 movdqa \XMM8, \TMP1 831 pshufd $78, \XMM8, \TMP2 832 pxor \XMM8, \TMP2 833 movdqa HashKey(%rsp), \TMP5 834 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 835 movaps 0x90(%arg1), \TMP3 836 AESENC \TMP3, \XMM1 # Round 9 837 AESENC \TMP3, \XMM2 838 AESENC \TMP3, \XMM3 839 AESENC \TMP3, \XMM4 840 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 841 lea 0xa0(%arg1),%r10 842 mov keysize,%eax 843 shr $2,%eax # 128->4, 192->6, 256->8 844 sub $4,%eax # 128->0, 192->2, 256->4 845 jz aes_loop_par_enc_done 846 847aes_loop_par_enc: 848 MOVADQ (%r10),\TMP3 849.irpc index, 1234 850 AESENC \TMP3, %xmm\index 851.endr 852 add $16,%r10 853 sub $1,%eax 854 jnz aes_loop_par_enc 855 856aes_loop_par_enc_done: 857 MOVADQ (%r10), \TMP3 858 AESENCLAST \TMP3, \XMM1 # Round 10 859 AESENCLAST \TMP3, \XMM2 860 AESENCLAST \TMP3, \XMM3 861 AESENCLAST \TMP3, \XMM4 862 movdqa HashKey_k(%rsp), \TMP5 863 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 864 movdqu (%arg3,%r11,1), \TMP3 865 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 866 movdqu 16(%arg3,%r11,1), \TMP3 867 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 868 movdqu 32(%arg3,%r11,1), \TMP3 869 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 870 movdqu 48(%arg3,%r11,1), \TMP3 871 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 872 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer 873 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer 874 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer 875 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer 876 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 877 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 878 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 879 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 880 881 pxor \TMP4, \TMP1 882 pxor \XMM8, \XMM5 883 pxor \TMP6, \TMP2 884 pxor \TMP1, \TMP2 885 pxor \XMM5, \TMP2 886 movdqa \TMP2, \TMP3 887 pslldq $8, \TMP3 # left shift TMP3 2 DWs 888 psrldq $8, \TMP2 # right shift TMP2 2 DWs 889 pxor \TMP3, \XMM5 890 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 891 892 # first phase of reduction 893 894 movdqa \XMM5, \TMP2 895 movdqa \XMM5, \TMP3 896 movdqa \XMM5, \TMP4 897# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 898 pslld $31, \TMP2 # packed right shift << 31 899 pslld $30, \TMP3 # packed right shift << 30 900 pslld $25, \TMP4 # packed right shift << 25 901 pxor \TMP3, \TMP2 # xor the shifted versions 902 pxor \TMP4, \TMP2 903 movdqa \TMP2, \TMP5 904 psrldq $4, \TMP5 # right shift T5 1 DW 905 pslldq $12, \TMP2 # left shift T2 3 DWs 906 pxor \TMP2, \XMM5 907 908 # second phase of reduction 909 910 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 911 movdqa \XMM5,\TMP3 912 movdqa \XMM5,\TMP4 913 psrld $1, \TMP2 # packed left shift >>1 914 psrld $2, \TMP3 # packed left shift >>2 915 psrld $7, \TMP4 # packed left shift >>7 916 pxor \TMP3,\TMP2 # xor the shifted versions 917 pxor \TMP4,\TMP2 918 pxor \TMP5, \TMP2 919 pxor \TMP2, \XMM5 920 pxor \TMP1, \XMM5 # result is in TMP1 921 922 pxor \XMM5, \XMM1 923.endm 924 925/* 926* decrypt 4 blocks at a time 927* ghash the 4 previously decrypted ciphertext blocks 928* arg1, %arg2, %arg3 are used as pointers only, not modified 929* %r11 is the data offset value 930*/ 931.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ 932TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 933 934 movdqa \XMM1, \XMM5 935 movdqa \XMM2, \XMM6 936 movdqa \XMM3, \XMM7 937 movdqa \XMM4, \XMM8 938 939 movdqa SHUF_MASK(%rip), %xmm15 940 # multiply TMP5 * HashKey using karatsuba 941 942 movdqa \XMM5, \TMP4 943 pshufd $78, \XMM5, \TMP6 944 pxor \XMM5, \TMP6 945 paddd ONE(%rip), \XMM0 # INCR CNT 946 movdqa HashKey_4(%rsp), \TMP5 947 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 948 movdqa \XMM0, \XMM1 949 paddd ONE(%rip), \XMM0 # INCR CNT 950 movdqa \XMM0, \XMM2 951 paddd ONE(%rip), \XMM0 # INCR CNT 952 movdqa \XMM0, \XMM3 953 paddd ONE(%rip), \XMM0 # INCR CNT 954 movdqa \XMM0, \XMM4 955 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 956 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 957 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 958 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 959 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 960 961 pxor (%arg1), \XMM1 962 pxor (%arg1), \XMM2 963 pxor (%arg1), \XMM3 964 pxor (%arg1), \XMM4 965 movdqa HashKey_4_k(%rsp), \TMP5 966 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 967 movaps 0x10(%arg1), \TMP1 968 AESENC \TMP1, \XMM1 # Round 1 969 AESENC \TMP1, \XMM2 970 AESENC \TMP1, \XMM3 971 AESENC \TMP1, \XMM4 972 movaps 0x20(%arg1), \TMP1 973 AESENC \TMP1, \XMM1 # Round 2 974 AESENC \TMP1, \XMM2 975 AESENC \TMP1, \XMM3 976 AESENC \TMP1, \XMM4 977 movdqa \XMM6, \TMP1 978 pshufd $78, \XMM6, \TMP2 979 pxor \XMM6, \TMP2 980 movdqa HashKey_3(%rsp), \TMP5 981 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 982 movaps 0x30(%arg1), \TMP3 983 AESENC \TMP3, \XMM1 # Round 3 984 AESENC \TMP3, \XMM2 985 AESENC \TMP3, \XMM3 986 AESENC \TMP3, \XMM4 987 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 988 movaps 0x40(%arg1), \TMP3 989 AESENC \TMP3, \XMM1 # Round 4 990 AESENC \TMP3, \XMM2 991 AESENC \TMP3, \XMM3 992 AESENC \TMP3, \XMM4 993 movdqa HashKey_3_k(%rsp), \TMP5 994 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 995 movaps 0x50(%arg1), \TMP3 996 AESENC \TMP3, \XMM1 # Round 5 997 AESENC \TMP3, \XMM2 998 AESENC \TMP3, \XMM3 999 AESENC \TMP3, \XMM4 1000 pxor \TMP1, \TMP4 1001# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1002 pxor \XMM6, \XMM5 1003 pxor \TMP2, \TMP6 1004 movdqa \XMM7, \TMP1 1005 pshufd $78, \XMM7, \TMP2 1006 pxor \XMM7, \TMP2 1007 movdqa HashKey_2(%rsp ), \TMP5 1008 1009 # Multiply TMP5 * HashKey using karatsuba 1010 1011 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1012 movaps 0x60(%arg1), \TMP3 1013 AESENC \TMP3, \XMM1 # Round 6 1014 AESENC \TMP3, \XMM2 1015 AESENC \TMP3, \XMM3 1016 AESENC \TMP3, \XMM4 1017 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 1018 movaps 0x70(%arg1), \TMP3 1019 AESENC \TMP3, \XMM1 # Round 7 1020 AESENC \TMP3, \XMM2 1021 AESENC \TMP3, \XMM3 1022 AESENC \TMP3, \XMM4 1023 movdqa HashKey_2_k(%rsp), \TMP5 1024 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1025 movaps 0x80(%arg1), \TMP3 1026 AESENC \TMP3, \XMM1 # Round 8 1027 AESENC \TMP3, \XMM2 1028 AESENC \TMP3, \XMM3 1029 AESENC \TMP3, \XMM4 1030 pxor \TMP1, \TMP4 1031# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1032 pxor \XMM7, \XMM5 1033 pxor \TMP2, \TMP6 1034 1035 # Multiply XMM8 * HashKey 1036 # XMM8 and TMP5 hold the values for the two operands 1037 1038 movdqa \XMM8, \TMP1 1039 pshufd $78, \XMM8, \TMP2 1040 pxor \XMM8, \TMP2 1041 movdqa HashKey(%rsp), \TMP5 1042 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1043 movaps 0x90(%arg1), \TMP3 1044 AESENC \TMP3, \XMM1 # Round 9 1045 AESENC \TMP3, \XMM2 1046 AESENC \TMP3, \XMM3 1047 AESENC \TMP3, \XMM4 1048 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 1049 lea 0xa0(%arg1),%r10 1050 mov keysize,%eax 1051 shr $2,%eax # 128->4, 192->6, 256->8 1052 sub $4,%eax # 128->0, 192->2, 256->4 1053 jz aes_loop_par_dec_done 1054 1055aes_loop_par_dec: 1056 MOVADQ (%r10),\TMP3 1057.irpc index, 1234 1058 AESENC \TMP3, %xmm\index 1059.endr 1060 add $16,%r10 1061 sub $1,%eax 1062 jnz aes_loop_par_dec 1063 1064aes_loop_par_dec_done: 1065 MOVADQ (%r10), \TMP3 1066 AESENCLAST \TMP3, \XMM1 # last round 1067 AESENCLAST \TMP3, \XMM2 1068 AESENCLAST \TMP3, \XMM3 1069 AESENCLAST \TMP3, \XMM4 1070 movdqa HashKey_k(%rsp), \TMP5 1071 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1072 movdqu (%arg3,%r11,1), \TMP3 1073 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1074 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer 1075 movdqa \TMP3, \XMM1 1076 movdqu 16(%arg3,%r11,1), \TMP3 1077 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1078 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer 1079 movdqa \TMP3, \XMM2 1080 movdqu 32(%arg3,%r11,1), \TMP3 1081 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1082 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer 1083 movdqa \TMP3, \XMM3 1084 movdqu 48(%arg3,%r11,1), \TMP3 1085 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1086 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer 1087 movdqa \TMP3, \XMM4 1088 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 1089 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 1090 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 1091 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 1092 1093 pxor \TMP4, \TMP1 1094 pxor \XMM8, \XMM5 1095 pxor \TMP6, \TMP2 1096 pxor \TMP1, \TMP2 1097 pxor \XMM5, \TMP2 1098 movdqa \TMP2, \TMP3 1099 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1100 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1101 pxor \TMP3, \XMM5 1102 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1103 1104 # first phase of reduction 1105 1106 movdqa \XMM5, \TMP2 1107 movdqa \XMM5, \TMP3 1108 movdqa \XMM5, \TMP4 1109# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1110 pslld $31, \TMP2 # packed right shift << 31 1111 pslld $30, \TMP3 # packed right shift << 30 1112 pslld $25, \TMP4 # packed right shift << 25 1113 pxor \TMP3, \TMP2 # xor the shifted versions 1114 pxor \TMP4, \TMP2 1115 movdqa \TMP2, \TMP5 1116 psrldq $4, \TMP5 # right shift T5 1 DW 1117 pslldq $12, \TMP2 # left shift T2 3 DWs 1118 pxor \TMP2, \XMM5 1119 1120 # second phase of reduction 1121 1122 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1123 movdqa \XMM5,\TMP3 1124 movdqa \XMM5,\TMP4 1125 psrld $1, \TMP2 # packed left shift >>1 1126 psrld $2, \TMP3 # packed left shift >>2 1127 psrld $7, \TMP4 # packed left shift >>7 1128 pxor \TMP3,\TMP2 # xor the shifted versions 1129 pxor \TMP4,\TMP2 1130 pxor \TMP5, \TMP2 1131 pxor \TMP2, \XMM5 1132 pxor \TMP1, \XMM5 # result is in TMP1 1133 1134 pxor \XMM5, \XMM1 1135.endm 1136 1137/* GHASH the last 4 ciphertext blocks. */ 1138.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ 1139TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst 1140 1141 # Multiply TMP6 * HashKey (using Karatsuba) 1142 1143 movdqa \XMM1, \TMP6 1144 pshufd $78, \XMM1, \TMP2 1145 pxor \XMM1, \TMP2 1146 movdqa HashKey_4(%rsp), \TMP5 1147 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 1148 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 1149 movdqa HashKey_4_k(%rsp), \TMP4 1150 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1151 movdqa \XMM1, \XMMDst 1152 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 1153 1154 # Multiply TMP1 * HashKey (using Karatsuba) 1155 1156 movdqa \XMM2, \TMP1 1157 pshufd $78, \XMM2, \TMP2 1158 pxor \XMM2, \TMP2 1159 movdqa HashKey_3(%rsp), \TMP5 1160 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1161 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 1162 movdqa HashKey_3_k(%rsp), \TMP4 1163 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1164 pxor \TMP1, \TMP6 1165 pxor \XMM2, \XMMDst 1166 pxor \TMP2, \XMM1 1167# results accumulated in TMP6, XMMDst, XMM1 1168 1169 # Multiply TMP1 * HashKey (using Karatsuba) 1170 1171 movdqa \XMM3, \TMP1 1172 pshufd $78, \XMM3, \TMP2 1173 pxor \XMM3, \TMP2 1174 movdqa HashKey_2(%rsp), \TMP5 1175 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1176 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 1177 movdqa HashKey_2_k(%rsp), \TMP4 1178 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1179 pxor \TMP1, \TMP6 1180 pxor \XMM3, \XMMDst 1181 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 1182 1183 # Multiply TMP1 * HashKey (using Karatsuba) 1184 movdqa \XMM4, \TMP1 1185 pshufd $78, \XMM4, \TMP2 1186 pxor \XMM4, \TMP2 1187 movdqa HashKey(%rsp), \TMP5 1188 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1189 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 1190 movdqa HashKey_k(%rsp), \TMP4 1191 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1192 pxor \TMP1, \TMP6 1193 pxor \XMM4, \XMMDst 1194 pxor \XMM1, \TMP2 1195 pxor \TMP6, \TMP2 1196 pxor \XMMDst, \TMP2 1197 # middle section of the temp results combined as in karatsuba algorithm 1198 movdqa \TMP2, \TMP4 1199 pslldq $8, \TMP4 # left shift TMP4 2 DWs 1200 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1201 pxor \TMP4, \XMMDst 1202 pxor \TMP2, \TMP6 1203# TMP6:XMMDst holds the result of the accumulated carry-less multiplications 1204 # first phase of the reduction 1205 movdqa \XMMDst, \TMP2 1206 movdqa \XMMDst, \TMP3 1207 movdqa \XMMDst, \TMP4 1208# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently 1209 pslld $31, \TMP2 # packed right shifting << 31 1210 pslld $30, \TMP3 # packed right shifting << 30 1211 pslld $25, \TMP4 # packed right shifting << 25 1212 pxor \TMP3, \TMP2 # xor the shifted versions 1213 pxor \TMP4, \TMP2 1214 movdqa \TMP2, \TMP7 1215 psrldq $4, \TMP7 # right shift TMP7 1 DW 1216 pslldq $12, \TMP2 # left shift TMP2 3 DWs 1217 pxor \TMP2, \XMMDst 1218 1219 # second phase of the reduction 1220 movdqa \XMMDst, \TMP2 1221 # make 3 copies of XMMDst for doing 3 shift operations 1222 movdqa \XMMDst, \TMP3 1223 movdqa \XMMDst, \TMP4 1224 psrld $1, \TMP2 # packed left shift >> 1 1225 psrld $2, \TMP3 # packed left shift >> 2 1226 psrld $7, \TMP4 # packed left shift >> 7 1227 pxor \TMP3, \TMP2 # xor the shifted versions 1228 pxor \TMP4, \TMP2 1229 pxor \TMP7, \TMP2 1230 pxor \TMP2, \XMMDst 1231 pxor \TMP6, \XMMDst # reduced result is in XMMDst 1232.endm 1233 1234 1235/* Encryption of a single block 1236* uses eax & r10 1237*/ 1238 1239.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 1240 1241 pxor (%arg1), \XMM0 1242 mov keysize,%eax 1243 shr $2,%eax # 128->4, 192->6, 256->8 1244 add $5,%eax # 128->9, 192->11, 256->13 1245 lea 16(%arg1), %r10 # get first expanded key address 1246 1247_esb_loop_\@: 1248 MOVADQ (%r10),\TMP1 1249 AESENC \TMP1,\XMM0 1250 add $16,%r10 1251 sub $1,%eax 1252 jnz _esb_loop_\@ 1253 1254 MOVADQ (%r10),\TMP1 1255 AESENCLAST \TMP1,\XMM0 1256.endm 1257/***************************************************************************** 1258* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1259* u8 *out, // Plaintext output. Encrypt in-place is allowed. 1260* const u8 *in, // Ciphertext input 1261* u64 plaintext_len, // Length of data in bytes for decryption. 1262* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1263* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1264* // concatenated with 0x00000001. 16-byte aligned pointer. 1265* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1266* const u8 *aad, // Additional Authentication Data (AAD) 1267* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1268* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the 1269* // given authentication tag and only return the plaintext if they match. 1270* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 1271* // (most likely), 12 or 8. 1272* 1273* Assumptions: 1274* 1275* keys: 1276* keys are pre-expanded and aligned to 16 bytes. we are using the first 1277* set of 11 keys in the data structure void *aes_ctx 1278* 1279* iv: 1280* 0 1 2 3 1281* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1282* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1283* | Salt (From the SA) | 1284* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1285* | Initialization Vector | 1286* | (This is the sequence number from IPSec header) | 1287* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1288* | 0x1 | 1289* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1290* 1291* 1292* 1293* AAD: 1294* AAD padded to 128 bits with 0 1295* for example, assume AAD is a u32 vector 1296* 1297* if AAD is 8 bytes: 1298* AAD[3] = {A0, A1}; 1299* padded AAD in xmm register = {A1 A0 0 0} 1300* 1301* 0 1 2 3 1302* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1303* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1304* | SPI (A1) | 1305* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1306* | 32-bit Sequence Number (A0) | 1307* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1308* | 0x0 | 1309* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1310* 1311* AAD Format with 32-bit Sequence Number 1312* 1313* if AAD is 12 bytes: 1314* AAD[3] = {A0, A1, A2}; 1315* padded AAD in xmm register = {A2 A1 A0 0} 1316* 1317* 0 1 2 3 1318* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1319* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1320* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1321* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1322* | SPI (A2) | 1323* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1324* | 64-bit Extended Sequence Number {A1,A0} | 1325* | | 1326* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1327* | 0x0 | 1328* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1329* 1330* AAD Format with 64-bit Extended Sequence Number 1331* 1332* poly = x^128 + x^127 + x^126 + x^121 + 1 1333* 1334*****************************************************************************/ 1335ENTRY(aesni_gcm_dec) 1336 push %r12 1337 push %r13 1338 push %r14 1339 mov %rsp, %r14 1340/* 1341* states of %xmm registers %xmm6:%xmm15 not saved 1342* all %xmm registers are clobbered 1343*/ 1344 sub $VARIABLE_OFFSET, %rsp 1345 and $~63, %rsp # align rsp to 64 bytes 1346 mov %arg6, %r12 1347 movdqu (%r12), %xmm13 # %xmm13 = HashKey 1348 movdqa SHUF_MASK(%rip), %xmm2 1349 PSHUFB_XMM %xmm2, %xmm13 1350 1351 1352# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) 1353 1354 movdqa %xmm13, %xmm2 1355 psllq $1, %xmm13 1356 psrlq $63, %xmm2 1357 movdqa %xmm2, %xmm1 1358 pslldq $8, %xmm2 1359 psrldq $8, %xmm1 1360 por %xmm2, %xmm13 1361 1362 # Reduction 1363 1364 pshufd $0x24, %xmm1, %xmm2 1365 pcmpeqd TWOONE(%rip), %xmm2 1366 pand POLY(%rip), %xmm2 1367 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly) 1368 1369 1370 # Decrypt first few blocks 1371 1372 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) 1373 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext 1374 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) 1375 mov %r13, %r12 1376 and $(3<<4), %r12 1377 jz _initial_num_blocks_is_0_decrypt 1378 cmp $(2<<4), %r12 1379 jb _initial_num_blocks_is_1_decrypt 1380 je _initial_num_blocks_is_2_decrypt 1381_initial_num_blocks_is_3_decrypt: 1382 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1383%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec 1384 sub $48, %r13 1385 jmp _initial_blocks_decrypted 1386_initial_num_blocks_is_2_decrypt: 1387 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1388%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec 1389 sub $32, %r13 1390 jmp _initial_blocks_decrypted 1391_initial_num_blocks_is_1_decrypt: 1392 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1393%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec 1394 sub $16, %r13 1395 jmp _initial_blocks_decrypted 1396_initial_num_blocks_is_0_decrypt: 1397 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1398%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec 1399_initial_blocks_decrypted: 1400 cmp $0, %r13 1401 je _zero_cipher_left_decrypt 1402 sub $64, %r13 1403 je _four_cipher_left_decrypt 1404_decrypt_by_4: 1405 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1406%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec 1407 add $64, %r11 1408 sub $64, %r13 1409 jne _decrypt_by_4 1410_four_cipher_left_decrypt: 1411 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 1412%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 1413_zero_cipher_left_decrypt: 1414 mov %arg4, %r13 1415 and $15, %r13 # %r13 = arg4 (mod 16) 1416 je _multiple_of_16_bytes_decrypt 1417 1418 # Handle the last <16 byte block separately 1419 1420 paddd ONE(%rip), %xmm0 # increment CNT to get Yn 1421 movdqa SHUF_MASK(%rip), %xmm10 1422 PSHUFB_XMM %xmm10, %xmm0 1423 1424 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) 1425 1426 lea (%arg3,%r11,1), %r10 1427 mov %r13, %r12 1428 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 1429 1430 lea ALL_F+16(%rip), %r12 1431 sub %r13, %r12 1432 movdqa %xmm1, %xmm2 1433 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) 1434 movdqu (%r12), %xmm1 1435 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 1436 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 1437 pand %xmm1, %xmm2 1438 movdqa SHUF_MASK(%rip), %xmm10 1439 PSHUFB_XMM %xmm10 ,%xmm2 1440 1441 pxor %xmm2, %xmm8 1442 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1443 1444 # output %r13 bytes 1445 MOVQ_R64_XMM %xmm0, %rax 1446 cmp $8, %r13 1447 jle _less_than_8_bytes_left_decrypt 1448 mov %rax, (%arg2 , %r11, 1) 1449 add $8, %r11 1450 psrldq $8, %xmm0 1451 MOVQ_R64_XMM %xmm0, %rax 1452 sub $8, %r13 1453_less_than_8_bytes_left_decrypt: 1454 mov %al, (%arg2, %r11, 1) 1455 add $1, %r11 1456 shr $8, %rax 1457 sub $1, %r13 1458 jne _less_than_8_bytes_left_decrypt 1459_multiple_of_16_bytes_decrypt: 1460 mov arg8, %r12 # %r13 = aadLen (number of bytes) 1461 shl $3, %r12 # convert into number of bits 1462 movd %r12d, %xmm15 # len(A) in %xmm15 1463 shl $3, %arg4 # len(C) in bits (*128) 1464 MOVQ_R64_XMM %arg4, %xmm1 1465 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1466 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1467 pxor %xmm15, %xmm8 1468 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1469 # final GHASH computation 1470 movdqa SHUF_MASK(%rip), %xmm10 1471 PSHUFB_XMM %xmm10, %xmm8 1472 1473 mov %arg5, %rax # %rax = *Y0 1474 movdqu (%rax), %xmm0 # %xmm0 = Y0 1475 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) 1476 pxor %xmm8, %xmm0 1477_return_T_decrypt: 1478 mov arg9, %r10 # %r10 = authTag 1479 mov arg10, %r11 # %r11 = auth_tag_len 1480 cmp $16, %r11 1481 je _T_16_decrypt 1482 cmp $8, %r11 1483 jl _T_4_decrypt 1484_T_8_decrypt: 1485 MOVQ_R64_XMM %xmm0, %rax 1486 mov %rax, (%r10) 1487 add $8, %r10 1488 sub $8, %r11 1489 psrldq $8, %xmm0 1490 cmp $0, %r11 1491 je _return_T_done_decrypt 1492_T_4_decrypt: 1493 movd %xmm0, %eax 1494 mov %eax, (%r10) 1495 add $4, %r10 1496 sub $4, %r11 1497 psrldq $4, %xmm0 1498 cmp $0, %r11 1499 je _return_T_done_decrypt 1500_T_123_decrypt: 1501 movd %xmm0, %eax 1502 cmp $2, %r11 1503 jl _T_1_decrypt 1504 mov %ax, (%r10) 1505 cmp $2, %r11 1506 je _return_T_done_decrypt 1507 add $2, %r10 1508 sar $16, %eax 1509_T_1_decrypt: 1510 mov %al, (%r10) 1511 jmp _return_T_done_decrypt 1512_T_16_decrypt: 1513 movdqu %xmm0, (%r10) 1514_return_T_done_decrypt: 1515 mov %r14, %rsp 1516 pop %r14 1517 pop %r13 1518 pop %r12 1519 ret 1520ENDPROC(aesni_gcm_dec) 1521 1522 1523/***************************************************************************** 1524* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1525* u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1526* const u8 *in, // Plaintext input 1527* u64 plaintext_len, // Length of data in bytes for encryption. 1528* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1529* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1530* // concatenated with 0x00000001. 16-byte aligned pointer. 1531* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1532* const u8 *aad, // Additional Authentication Data (AAD) 1533* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1534* u8 *auth_tag, // Authenticated Tag output. 1535* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 1536* // 12 or 8. 1537* 1538* Assumptions: 1539* 1540* keys: 1541* keys are pre-expanded and aligned to 16 bytes. we are using the 1542* first set of 11 keys in the data structure void *aes_ctx 1543* 1544* 1545* iv: 1546* 0 1 2 3 1547* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1548* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1549* | Salt (From the SA) | 1550* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1551* | Initialization Vector | 1552* | (This is the sequence number from IPSec header) | 1553* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1554* | 0x1 | 1555* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1556* 1557* 1558* 1559* AAD: 1560* AAD padded to 128 bits with 0 1561* for example, assume AAD is a u32 vector 1562* 1563* if AAD is 8 bytes: 1564* AAD[3] = {A0, A1}; 1565* padded AAD in xmm register = {A1 A0 0 0} 1566* 1567* 0 1 2 3 1568* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1569* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1570* | SPI (A1) | 1571* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1572* | 32-bit Sequence Number (A0) | 1573* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1574* | 0x0 | 1575* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1576* 1577* AAD Format with 32-bit Sequence Number 1578* 1579* if AAD is 12 bytes: 1580* AAD[3] = {A0, A1, A2}; 1581* padded AAD in xmm register = {A2 A1 A0 0} 1582* 1583* 0 1 2 3 1584* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1585* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1586* | SPI (A2) | 1587* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1588* | 64-bit Extended Sequence Number {A1,A0} | 1589* | | 1590* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1591* | 0x0 | 1592* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1593* 1594* AAD Format with 64-bit Extended Sequence Number 1595* 1596* poly = x^128 + x^127 + x^126 + x^121 + 1 1597***************************************************************************/ 1598ENTRY(aesni_gcm_enc) 1599 push %r12 1600 push %r13 1601 push %r14 1602 mov %rsp, %r14 1603# 1604# states of %xmm registers %xmm6:%xmm15 not saved 1605# all %xmm registers are clobbered 1606# 1607 sub $VARIABLE_OFFSET, %rsp 1608 and $~63, %rsp 1609 mov %arg6, %r12 1610 movdqu (%r12), %xmm13 1611 movdqa SHUF_MASK(%rip), %xmm2 1612 PSHUFB_XMM %xmm2, %xmm13 1613 1614 1615# precompute HashKey<<1 mod poly from the HashKey (required for GHASH) 1616 1617 movdqa %xmm13, %xmm2 1618 psllq $1, %xmm13 1619 psrlq $63, %xmm2 1620 movdqa %xmm2, %xmm1 1621 pslldq $8, %xmm2 1622 psrldq $8, %xmm1 1623 por %xmm2, %xmm13 1624 1625 # reduce HashKey<<1 1626 1627 pshufd $0x24, %xmm1, %xmm2 1628 pcmpeqd TWOONE(%rip), %xmm2 1629 pand POLY(%rip), %xmm2 1630 pxor %xmm2, %xmm13 1631 movdqa %xmm13, HashKey(%rsp) 1632 mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly) 1633 and $-16, %r13 1634 mov %r13, %r12 1635 1636 # Encrypt first few blocks 1637 1638 and $(3<<4), %r12 1639 jz _initial_num_blocks_is_0_encrypt 1640 cmp $(2<<4), %r12 1641 jb _initial_num_blocks_is_1_encrypt 1642 je _initial_num_blocks_is_2_encrypt 1643_initial_num_blocks_is_3_encrypt: 1644 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1645%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc 1646 sub $48, %r13 1647 jmp _initial_blocks_encrypted 1648_initial_num_blocks_is_2_encrypt: 1649 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1650%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc 1651 sub $32, %r13 1652 jmp _initial_blocks_encrypted 1653_initial_num_blocks_is_1_encrypt: 1654 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1655%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc 1656 sub $16, %r13 1657 jmp _initial_blocks_encrypted 1658_initial_num_blocks_is_0_encrypt: 1659 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1660%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc 1661_initial_blocks_encrypted: 1662 1663 # Main loop - Encrypt remaining blocks 1664 1665 cmp $0, %r13 1666 je _zero_cipher_left_encrypt 1667 sub $64, %r13 1668 je _four_cipher_left_encrypt 1669_encrypt_by_4_encrypt: 1670 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1671%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc 1672 add $64, %r11 1673 sub $64, %r13 1674 jne _encrypt_by_4_encrypt 1675_four_cipher_left_encrypt: 1676 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 1677%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 1678_zero_cipher_left_encrypt: 1679 mov %arg4, %r13 1680 and $15, %r13 # %r13 = arg4 (mod 16) 1681 je _multiple_of_16_bytes_encrypt 1682 1683 # Handle the last <16 Byte block separately 1684 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 1685 movdqa SHUF_MASK(%rip), %xmm10 1686 PSHUFB_XMM %xmm10, %xmm0 1687 1688 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) 1689 1690 lea (%arg3,%r11,1), %r10 1691 mov %r13, %r12 1692 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 1693 1694 lea ALL_F+16(%rip), %r12 1695 sub %r13, %r12 1696 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) 1697 movdqu (%r12), %xmm1 1698 # get the appropriate mask to mask out top 16-r13 bytes of xmm0 1699 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 1700 movdqa SHUF_MASK(%rip), %xmm10 1701 PSHUFB_XMM %xmm10,%xmm0 1702 1703 pxor %xmm0, %xmm8 1704 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1705 # GHASH computation for the last <16 byte block 1706 movdqa SHUF_MASK(%rip), %xmm10 1707 PSHUFB_XMM %xmm10, %xmm0 1708 1709 # shuffle xmm0 back to output as ciphertext 1710 1711 # Output %r13 bytes 1712 MOVQ_R64_XMM %xmm0, %rax 1713 cmp $8, %r13 1714 jle _less_than_8_bytes_left_encrypt 1715 mov %rax, (%arg2 , %r11, 1) 1716 add $8, %r11 1717 psrldq $8, %xmm0 1718 MOVQ_R64_XMM %xmm0, %rax 1719 sub $8, %r13 1720_less_than_8_bytes_left_encrypt: 1721 mov %al, (%arg2, %r11, 1) 1722 add $1, %r11 1723 shr $8, %rax 1724 sub $1, %r13 1725 jne _less_than_8_bytes_left_encrypt 1726_multiple_of_16_bytes_encrypt: 1727 mov arg8, %r12 # %r12 = addLen (number of bytes) 1728 shl $3, %r12 1729 movd %r12d, %xmm15 # len(A) in %xmm15 1730 shl $3, %arg4 # len(C) in bits (*128) 1731 MOVQ_R64_XMM %arg4, %xmm1 1732 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1733 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1734 pxor %xmm15, %xmm8 1735 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1736 # final GHASH computation 1737 movdqa SHUF_MASK(%rip), %xmm10 1738 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap 1739 1740 mov %arg5, %rax # %rax = *Y0 1741 movdqu (%rax), %xmm0 # %xmm0 = Y0 1742 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) 1743 pxor %xmm8, %xmm0 1744_return_T_encrypt: 1745 mov arg9, %r10 # %r10 = authTag 1746 mov arg10, %r11 # %r11 = auth_tag_len 1747 cmp $16, %r11 1748 je _T_16_encrypt 1749 cmp $8, %r11 1750 jl _T_4_encrypt 1751_T_8_encrypt: 1752 MOVQ_R64_XMM %xmm0, %rax 1753 mov %rax, (%r10) 1754 add $8, %r10 1755 sub $8, %r11 1756 psrldq $8, %xmm0 1757 cmp $0, %r11 1758 je _return_T_done_encrypt 1759_T_4_encrypt: 1760 movd %xmm0, %eax 1761 mov %eax, (%r10) 1762 add $4, %r10 1763 sub $4, %r11 1764 psrldq $4, %xmm0 1765 cmp $0, %r11 1766 je _return_T_done_encrypt 1767_T_123_encrypt: 1768 movd %xmm0, %eax 1769 cmp $2, %r11 1770 jl _T_1_encrypt 1771 mov %ax, (%r10) 1772 cmp $2, %r11 1773 je _return_T_done_encrypt 1774 add $2, %r10 1775 sar $16, %eax 1776_T_1_encrypt: 1777 mov %al, (%r10) 1778 jmp _return_T_done_encrypt 1779_T_16_encrypt: 1780 movdqu %xmm0, (%r10) 1781_return_T_done_encrypt: 1782 mov %r14, %rsp 1783 pop %r14 1784 pop %r13 1785 pop %r12 1786 ret 1787ENDPROC(aesni_gcm_enc) 1788 1789#endif 1790 1791 1792.align 4 1793_key_expansion_128: 1794_key_expansion_256a: 1795 pshufd $0b11111111, %xmm1, %xmm1 1796 shufps $0b00010000, %xmm0, %xmm4 1797 pxor %xmm4, %xmm0 1798 shufps $0b10001100, %xmm0, %xmm4 1799 pxor %xmm4, %xmm0 1800 pxor %xmm1, %xmm0 1801 movaps %xmm0, (TKEYP) 1802 add $0x10, TKEYP 1803 ret 1804ENDPROC(_key_expansion_128) 1805ENDPROC(_key_expansion_256a) 1806 1807.align 4 1808_key_expansion_192a: 1809 pshufd $0b01010101, %xmm1, %xmm1 1810 shufps $0b00010000, %xmm0, %xmm4 1811 pxor %xmm4, %xmm0 1812 shufps $0b10001100, %xmm0, %xmm4 1813 pxor %xmm4, %xmm0 1814 pxor %xmm1, %xmm0 1815 1816 movaps %xmm2, %xmm5 1817 movaps %xmm2, %xmm6 1818 pslldq $4, %xmm5 1819 pshufd $0b11111111, %xmm0, %xmm3 1820 pxor %xmm3, %xmm2 1821 pxor %xmm5, %xmm2 1822 1823 movaps %xmm0, %xmm1 1824 shufps $0b01000100, %xmm0, %xmm6 1825 movaps %xmm6, (TKEYP) 1826 shufps $0b01001110, %xmm2, %xmm1 1827 movaps %xmm1, 0x10(TKEYP) 1828 add $0x20, TKEYP 1829 ret 1830ENDPROC(_key_expansion_192a) 1831 1832.align 4 1833_key_expansion_192b: 1834 pshufd $0b01010101, %xmm1, %xmm1 1835 shufps $0b00010000, %xmm0, %xmm4 1836 pxor %xmm4, %xmm0 1837 shufps $0b10001100, %xmm0, %xmm4 1838 pxor %xmm4, %xmm0 1839 pxor %xmm1, %xmm0 1840 1841 movaps %xmm2, %xmm5 1842 pslldq $4, %xmm5 1843 pshufd $0b11111111, %xmm0, %xmm3 1844 pxor %xmm3, %xmm2 1845 pxor %xmm5, %xmm2 1846 1847 movaps %xmm0, (TKEYP) 1848 add $0x10, TKEYP 1849 ret 1850ENDPROC(_key_expansion_192b) 1851 1852.align 4 1853_key_expansion_256b: 1854 pshufd $0b10101010, %xmm1, %xmm1 1855 shufps $0b00010000, %xmm2, %xmm4 1856 pxor %xmm4, %xmm2 1857 shufps $0b10001100, %xmm2, %xmm4 1858 pxor %xmm4, %xmm2 1859 pxor %xmm1, %xmm2 1860 movaps %xmm2, (TKEYP) 1861 add $0x10, TKEYP 1862 ret 1863ENDPROC(_key_expansion_256b) 1864 1865/* 1866 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 1867 * unsigned int key_len) 1868 */ 1869ENTRY(aesni_set_key) 1870 FRAME_BEGIN 1871#ifndef __x86_64__ 1872 pushl KEYP 1873 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx 1874 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key 1875 movl (FRAME_OFFSET+16)(%esp), %edx # key_len 1876#endif 1877 movups (UKEYP), %xmm0 # user key (first 16 bytes) 1878 movaps %xmm0, (KEYP) 1879 lea 0x10(KEYP), TKEYP # key addr 1880 movl %edx, 480(KEYP) 1881 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1882 cmp $24, %dl 1883 jb .Lenc_key128 1884 je .Lenc_key192 1885 movups 0x10(UKEYP), %xmm2 # other user key 1886 movaps %xmm2, (TKEYP) 1887 add $0x10, TKEYP 1888 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1889 call _key_expansion_256a 1890 AESKEYGENASSIST 0x1 %xmm0 %xmm1 1891 call _key_expansion_256b 1892 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1893 call _key_expansion_256a 1894 AESKEYGENASSIST 0x2 %xmm0 %xmm1 1895 call _key_expansion_256b 1896 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 1897 call _key_expansion_256a 1898 AESKEYGENASSIST 0x4 %xmm0 %xmm1 1899 call _key_expansion_256b 1900 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 1901 call _key_expansion_256a 1902 AESKEYGENASSIST 0x8 %xmm0 %xmm1 1903 call _key_expansion_256b 1904 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 1905 call _key_expansion_256a 1906 AESKEYGENASSIST 0x10 %xmm0 %xmm1 1907 call _key_expansion_256b 1908 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 1909 call _key_expansion_256a 1910 AESKEYGENASSIST 0x20 %xmm0 %xmm1 1911 call _key_expansion_256b 1912 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 1913 call _key_expansion_256a 1914 jmp .Ldec_key 1915.Lenc_key192: 1916 movq 0x10(UKEYP), %xmm2 # other user key 1917 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1918 call _key_expansion_192a 1919 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1920 call _key_expansion_192b 1921 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 1922 call _key_expansion_192a 1923 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 1924 call _key_expansion_192b 1925 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 1926 call _key_expansion_192a 1927 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 1928 call _key_expansion_192b 1929 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 1930 call _key_expansion_192a 1931 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8 1932 call _key_expansion_192b 1933 jmp .Ldec_key 1934.Lenc_key128: 1935 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1 1936 call _key_expansion_128 1937 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2 1938 call _key_expansion_128 1939 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3 1940 call _key_expansion_128 1941 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4 1942 call _key_expansion_128 1943 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5 1944 call _key_expansion_128 1945 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6 1946 call _key_expansion_128 1947 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7 1948 call _key_expansion_128 1949 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8 1950 call _key_expansion_128 1951 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9 1952 call _key_expansion_128 1953 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 1954 call _key_expansion_128 1955.Ldec_key: 1956 sub $0x10, TKEYP 1957 movaps (KEYP), %xmm0 1958 movaps (TKEYP), %xmm1 1959 movaps %xmm0, 240(TKEYP) 1960 movaps %xmm1, 240(KEYP) 1961 add $0x10, KEYP 1962 lea 240-16(TKEYP), UKEYP 1963.align 4 1964.Ldec_key_loop: 1965 movaps (KEYP), %xmm0 1966 AESIMC %xmm0 %xmm1 1967 movaps %xmm1, (UKEYP) 1968 add $0x10, KEYP 1969 sub $0x10, UKEYP 1970 cmp TKEYP, KEYP 1971 jb .Ldec_key_loop 1972 xor AREG, AREG 1973#ifndef __x86_64__ 1974 popl KEYP 1975#endif 1976 FRAME_END 1977 ret 1978ENDPROC(aesni_set_key) 1979 1980/* 1981 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 1982 */ 1983ENTRY(aesni_enc) 1984 FRAME_BEGIN 1985#ifndef __x86_64__ 1986 pushl KEYP 1987 pushl KLEN 1988 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx 1989 movl (FRAME_OFFSET+16)(%esp), OUTP # dst 1990 movl (FRAME_OFFSET+20)(%esp), INP # src 1991#endif 1992 movl 480(KEYP), KLEN # key length 1993 movups (INP), STATE # input 1994 call _aesni_enc1 1995 movups STATE, (OUTP) # output 1996#ifndef __x86_64__ 1997 popl KLEN 1998 popl KEYP 1999#endif 2000 FRAME_END 2001 ret 2002ENDPROC(aesni_enc) 2003 2004/* 2005 * _aesni_enc1: internal ABI 2006 * input: 2007 * KEYP: key struct pointer 2008 * KLEN: round count 2009 * STATE: initial state (input) 2010 * output: 2011 * STATE: finial state (output) 2012 * changed: 2013 * KEY 2014 * TKEYP (T1) 2015 */ 2016.align 4 2017_aesni_enc1: 2018 movaps (KEYP), KEY # key 2019 mov KEYP, TKEYP 2020 pxor KEY, STATE # round 0 2021 add $0x30, TKEYP 2022 cmp $24, KLEN 2023 jb .Lenc128 2024 lea 0x20(TKEYP), TKEYP 2025 je .Lenc192 2026 add $0x20, TKEYP 2027 movaps -0x60(TKEYP), KEY 2028 AESENC KEY STATE 2029 movaps -0x50(TKEYP), KEY 2030 AESENC KEY STATE 2031.align 4 2032.Lenc192: 2033 movaps -0x40(TKEYP), KEY 2034 AESENC KEY STATE 2035 movaps -0x30(TKEYP), KEY 2036 AESENC KEY STATE 2037.align 4 2038.Lenc128: 2039 movaps -0x20(TKEYP), KEY 2040 AESENC KEY STATE 2041 movaps -0x10(TKEYP), KEY 2042 AESENC KEY STATE 2043 movaps (TKEYP), KEY 2044 AESENC KEY STATE 2045 movaps 0x10(TKEYP), KEY 2046 AESENC KEY STATE 2047 movaps 0x20(TKEYP), KEY 2048 AESENC KEY STATE 2049 movaps 0x30(TKEYP), KEY 2050 AESENC KEY STATE 2051 movaps 0x40(TKEYP), KEY 2052 AESENC KEY STATE 2053 movaps 0x50(TKEYP), KEY 2054 AESENC KEY STATE 2055 movaps 0x60(TKEYP), KEY 2056 AESENC KEY STATE 2057 movaps 0x70(TKEYP), KEY 2058 AESENCLAST KEY STATE 2059 ret 2060ENDPROC(_aesni_enc1) 2061 2062/* 2063 * _aesni_enc4: internal ABI 2064 * input: 2065 * KEYP: key struct pointer 2066 * KLEN: round count 2067 * STATE1: initial state (input) 2068 * STATE2 2069 * STATE3 2070 * STATE4 2071 * output: 2072 * STATE1: finial state (output) 2073 * STATE2 2074 * STATE3 2075 * STATE4 2076 * changed: 2077 * KEY 2078 * TKEYP (T1) 2079 */ 2080.align 4 2081_aesni_enc4: 2082 movaps (KEYP), KEY # key 2083 mov KEYP, TKEYP 2084 pxor KEY, STATE1 # round 0 2085 pxor KEY, STATE2 2086 pxor KEY, STATE3 2087 pxor KEY, STATE4 2088 add $0x30, TKEYP 2089 cmp $24, KLEN 2090 jb .L4enc128 2091 lea 0x20(TKEYP), TKEYP 2092 je .L4enc192 2093 add $0x20, TKEYP 2094 movaps -0x60(TKEYP), KEY 2095 AESENC KEY STATE1 2096 AESENC KEY STATE2 2097 AESENC KEY STATE3 2098 AESENC KEY STATE4 2099 movaps -0x50(TKEYP), KEY 2100 AESENC KEY STATE1 2101 AESENC KEY STATE2 2102 AESENC KEY STATE3 2103 AESENC KEY STATE4 2104#.align 4 2105.L4enc192: 2106 movaps -0x40(TKEYP), KEY 2107 AESENC KEY STATE1 2108 AESENC KEY STATE2 2109 AESENC KEY STATE3 2110 AESENC KEY STATE4 2111 movaps -0x30(TKEYP), KEY 2112 AESENC KEY STATE1 2113 AESENC KEY STATE2 2114 AESENC KEY STATE3 2115 AESENC KEY STATE4 2116#.align 4 2117.L4enc128: 2118 movaps -0x20(TKEYP), KEY 2119 AESENC KEY STATE1 2120 AESENC KEY STATE2 2121 AESENC KEY STATE3 2122 AESENC KEY STATE4 2123 movaps -0x10(TKEYP), KEY 2124 AESENC KEY STATE1 2125 AESENC KEY STATE2 2126 AESENC KEY STATE3 2127 AESENC KEY STATE4 2128 movaps (TKEYP), KEY 2129 AESENC KEY STATE1 2130 AESENC KEY STATE2 2131 AESENC KEY STATE3 2132 AESENC KEY STATE4 2133 movaps 0x10(TKEYP), KEY 2134 AESENC KEY STATE1 2135 AESENC KEY STATE2 2136 AESENC KEY STATE3 2137 AESENC KEY STATE4 2138 movaps 0x20(TKEYP), KEY 2139 AESENC KEY STATE1 2140 AESENC KEY STATE2 2141 AESENC KEY STATE3 2142 AESENC KEY STATE4 2143 movaps 0x30(TKEYP), KEY 2144 AESENC KEY STATE1 2145 AESENC KEY STATE2 2146 AESENC KEY STATE3 2147 AESENC KEY STATE4 2148 movaps 0x40(TKEYP), KEY 2149 AESENC KEY STATE1 2150 AESENC KEY STATE2 2151 AESENC KEY STATE3 2152 AESENC KEY STATE4 2153 movaps 0x50(TKEYP), KEY 2154 AESENC KEY STATE1 2155 AESENC KEY STATE2 2156 AESENC KEY STATE3 2157 AESENC KEY STATE4 2158 movaps 0x60(TKEYP), KEY 2159 AESENC KEY STATE1 2160 AESENC KEY STATE2 2161 AESENC KEY STATE3 2162 AESENC KEY STATE4 2163 movaps 0x70(TKEYP), KEY 2164 AESENCLAST KEY STATE1 # last round 2165 AESENCLAST KEY STATE2 2166 AESENCLAST KEY STATE3 2167 AESENCLAST KEY STATE4 2168 ret 2169ENDPROC(_aesni_enc4) 2170 2171/* 2172 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 2173 */ 2174ENTRY(aesni_dec) 2175 FRAME_BEGIN 2176#ifndef __x86_64__ 2177 pushl KEYP 2178 pushl KLEN 2179 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx 2180 movl (FRAME_OFFSET+16)(%esp), OUTP # dst 2181 movl (FRAME_OFFSET+20)(%esp), INP # src 2182#endif 2183 mov 480(KEYP), KLEN # key length 2184 add $240, KEYP 2185 movups (INP), STATE # input 2186 call _aesni_dec1 2187 movups STATE, (OUTP) #output 2188#ifndef __x86_64__ 2189 popl KLEN 2190 popl KEYP 2191#endif 2192 FRAME_END 2193 ret 2194ENDPROC(aesni_dec) 2195 2196/* 2197 * _aesni_dec1: internal ABI 2198 * input: 2199 * KEYP: key struct pointer 2200 * KLEN: key length 2201 * STATE: initial state (input) 2202 * output: 2203 * STATE: finial state (output) 2204 * changed: 2205 * KEY 2206 * TKEYP (T1) 2207 */ 2208.align 4 2209_aesni_dec1: 2210 movaps (KEYP), KEY # key 2211 mov KEYP, TKEYP 2212 pxor KEY, STATE # round 0 2213 add $0x30, TKEYP 2214 cmp $24, KLEN 2215 jb .Ldec128 2216 lea 0x20(TKEYP), TKEYP 2217 je .Ldec192 2218 add $0x20, TKEYP 2219 movaps -0x60(TKEYP), KEY 2220 AESDEC KEY STATE 2221 movaps -0x50(TKEYP), KEY 2222 AESDEC KEY STATE 2223.align 4 2224.Ldec192: 2225 movaps -0x40(TKEYP), KEY 2226 AESDEC KEY STATE 2227 movaps -0x30(TKEYP), KEY 2228 AESDEC KEY STATE 2229.align 4 2230.Ldec128: 2231 movaps -0x20(TKEYP), KEY 2232 AESDEC KEY STATE 2233 movaps -0x10(TKEYP), KEY 2234 AESDEC KEY STATE 2235 movaps (TKEYP), KEY 2236 AESDEC KEY STATE 2237 movaps 0x10(TKEYP), KEY 2238 AESDEC KEY STATE 2239 movaps 0x20(TKEYP), KEY 2240 AESDEC KEY STATE 2241 movaps 0x30(TKEYP), KEY 2242 AESDEC KEY STATE 2243 movaps 0x40(TKEYP), KEY 2244 AESDEC KEY STATE 2245 movaps 0x50(TKEYP), KEY 2246 AESDEC KEY STATE 2247 movaps 0x60(TKEYP), KEY 2248 AESDEC KEY STATE 2249 movaps 0x70(TKEYP), KEY 2250 AESDECLAST KEY STATE 2251 ret 2252ENDPROC(_aesni_dec1) 2253 2254/* 2255 * _aesni_dec4: internal ABI 2256 * input: 2257 * KEYP: key struct pointer 2258 * KLEN: key length 2259 * STATE1: initial state (input) 2260 * STATE2 2261 * STATE3 2262 * STATE4 2263 * output: 2264 * STATE1: finial state (output) 2265 * STATE2 2266 * STATE3 2267 * STATE4 2268 * changed: 2269 * KEY 2270 * TKEYP (T1) 2271 */ 2272.align 4 2273_aesni_dec4: 2274 movaps (KEYP), KEY # key 2275 mov KEYP, TKEYP 2276 pxor KEY, STATE1 # round 0 2277 pxor KEY, STATE2 2278 pxor KEY, STATE3 2279 pxor KEY, STATE4 2280 add $0x30, TKEYP 2281 cmp $24, KLEN 2282 jb .L4dec128 2283 lea 0x20(TKEYP), TKEYP 2284 je .L4dec192 2285 add $0x20, TKEYP 2286 movaps -0x60(TKEYP), KEY 2287 AESDEC KEY STATE1 2288 AESDEC KEY STATE2 2289 AESDEC KEY STATE3 2290 AESDEC KEY STATE4 2291 movaps -0x50(TKEYP), KEY 2292 AESDEC KEY STATE1 2293 AESDEC KEY STATE2 2294 AESDEC KEY STATE3 2295 AESDEC KEY STATE4 2296.align 4 2297.L4dec192: 2298 movaps -0x40(TKEYP), KEY 2299 AESDEC KEY STATE1 2300 AESDEC KEY STATE2 2301 AESDEC KEY STATE3 2302 AESDEC KEY STATE4 2303 movaps -0x30(TKEYP), KEY 2304 AESDEC KEY STATE1 2305 AESDEC KEY STATE2 2306 AESDEC KEY STATE3 2307 AESDEC KEY STATE4 2308.align 4 2309.L4dec128: 2310 movaps -0x20(TKEYP), KEY 2311 AESDEC KEY STATE1 2312 AESDEC KEY STATE2 2313 AESDEC KEY STATE3 2314 AESDEC KEY STATE4 2315 movaps -0x10(TKEYP), KEY 2316 AESDEC KEY STATE1 2317 AESDEC KEY STATE2 2318 AESDEC KEY STATE3 2319 AESDEC KEY STATE4 2320 movaps (TKEYP), KEY 2321 AESDEC KEY STATE1 2322 AESDEC KEY STATE2 2323 AESDEC KEY STATE3 2324 AESDEC KEY STATE4 2325 movaps 0x10(TKEYP), KEY 2326 AESDEC KEY STATE1 2327 AESDEC KEY STATE2 2328 AESDEC KEY STATE3 2329 AESDEC KEY STATE4 2330 movaps 0x20(TKEYP), KEY 2331 AESDEC KEY STATE1 2332 AESDEC KEY STATE2 2333 AESDEC KEY STATE3 2334 AESDEC KEY STATE4 2335 movaps 0x30(TKEYP), KEY 2336 AESDEC KEY STATE1 2337 AESDEC KEY STATE2 2338 AESDEC KEY STATE3 2339 AESDEC KEY STATE4 2340 movaps 0x40(TKEYP), KEY 2341 AESDEC KEY STATE1 2342 AESDEC KEY STATE2 2343 AESDEC KEY STATE3 2344 AESDEC KEY STATE4 2345 movaps 0x50(TKEYP), KEY 2346 AESDEC KEY STATE1 2347 AESDEC KEY STATE2 2348 AESDEC KEY STATE3 2349 AESDEC KEY STATE4 2350 movaps 0x60(TKEYP), KEY 2351 AESDEC KEY STATE1 2352 AESDEC KEY STATE2 2353 AESDEC KEY STATE3 2354 AESDEC KEY STATE4 2355 movaps 0x70(TKEYP), KEY 2356 AESDECLAST KEY STATE1 # last round 2357 AESDECLAST KEY STATE2 2358 AESDECLAST KEY STATE3 2359 AESDECLAST KEY STATE4 2360 ret 2361ENDPROC(_aesni_dec4) 2362 2363/* 2364 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2365 * size_t len) 2366 */ 2367ENTRY(aesni_ecb_enc) 2368 FRAME_BEGIN 2369#ifndef __x86_64__ 2370 pushl LEN 2371 pushl KEYP 2372 pushl KLEN 2373 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2374 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2375 movl (FRAME_OFFSET+24)(%esp), INP # src 2376 movl (FRAME_OFFSET+28)(%esp), LEN # len 2377#endif 2378 test LEN, LEN # check length 2379 jz .Lecb_enc_ret 2380 mov 480(KEYP), KLEN 2381 cmp $16, LEN 2382 jb .Lecb_enc_ret 2383 cmp $64, LEN 2384 jb .Lecb_enc_loop1 2385.align 4 2386.Lecb_enc_loop4: 2387 movups (INP), STATE1 2388 movups 0x10(INP), STATE2 2389 movups 0x20(INP), STATE3 2390 movups 0x30(INP), STATE4 2391 call _aesni_enc4 2392 movups STATE1, (OUTP) 2393 movups STATE2, 0x10(OUTP) 2394 movups STATE3, 0x20(OUTP) 2395 movups STATE4, 0x30(OUTP) 2396 sub $64, LEN 2397 add $64, INP 2398 add $64, OUTP 2399 cmp $64, LEN 2400 jge .Lecb_enc_loop4 2401 cmp $16, LEN 2402 jb .Lecb_enc_ret 2403.align 4 2404.Lecb_enc_loop1: 2405 movups (INP), STATE1 2406 call _aesni_enc1 2407 movups STATE1, (OUTP) 2408 sub $16, LEN 2409 add $16, INP 2410 add $16, OUTP 2411 cmp $16, LEN 2412 jge .Lecb_enc_loop1 2413.Lecb_enc_ret: 2414#ifndef __x86_64__ 2415 popl KLEN 2416 popl KEYP 2417 popl LEN 2418#endif 2419 FRAME_END 2420 ret 2421ENDPROC(aesni_ecb_enc) 2422 2423/* 2424 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2425 * size_t len); 2426 */ 2427ENTRY(aesni_ecb_dec) 2428 FRAME_BEGIN 2429#ifndef __x86_64__ 2430 pushl LEN 2431 pushl KEYP 2432 pushl KLEN 2433 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2434 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2435 movl (FRAME_OFFSET+24)(%esp), INP # src 2436 movl (FRAME_OFFSET+28)(%esp), LEN # len 2437#endif 2438 test LEN, LEN 2439 jz .Lecb_dec_ret 2440 mov 480(KEYP), KLEN 2441 add $240, KEYP 2442 cmp $16, LEN 2443 jb .Lecb_dec_ret 2444 cmp $64, LEN 2445 jb .Lecb_dec_loop1 2446.align 4 2447.Lecb_dec_loop4: 2448 movups (INP), STATE1 2449 movups 0x10(INP), STATE2 2450 movups 0x20(INP), STATE3 2451 movups 0x30(INP), STATE4 2452 call _aesni_dec4 2453 movups STATE1, (OUTP) 2454 movups STATE2, 0x10(OUTP) 2455 movups STATE3, 0x20(OUTP) 2456 movups STATE4, 0x30(OUTP) 2457 sub $64, LEN 2458 add $64, INP 2459 add $64, OUTP 2460 cmp $64, LEN 2461 jge .Lecb_dec_loop4 2462 cmp $16, LEN 2463 jb .Lecb_dec_ret 2464.align 4 2465.Lecb_dec_loop1: 2466 movups (INP), STATE1 2467 call _aesni_dec1 2468 movups STATE1, (OUTP) 2469 sub $16, LEN 2470 add $16, INP 2471 add $16, OUTP 2472 cmp $16, LEN 2473 jge .Lecb_dec_loop1 2474.Lecb_dec_ret: 2475#ifndef __x86_64__ 2476 popl KLEN 2477 popl KEYP 2478 popl LEN 2479#endif 2480 FRAME_END 2481 ret 2482ENDPROC(aesni_ecb_dec) 2483 2484/* 2485 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2486 * size_t len, u8 *iv) 2487 */ 2488ENTRY(aesni_cbc_enc) 2489 FRAME_BEGIN 2490#ifndef __x86_64__ 2491 pushl IVP 2492 pushl LEN 2493 pushl KEYP 2494 pushl KLEN 2495 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2496 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2497 movl (FRAME_OFFSET+28)(%esp), INP # src 2498 movl (FRAME_OFFSET+32)(%esp), LEN # len 2499 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2500#endif 2501 cmp $16, LEN 2502 jb .Lcbc_enc_ret 2503 mov 480(KEYP), KLEN 2504 movups (IVP), STATE # load iv as initial state 2505.align 4 2506.Lcbc_enc_loop: 2507 movups (INP), IN # load input 2508 pxor IN, STATE 2509 call _aesni_enc1 2510 movups STATE, (OUTP) # store output 2511 sub $16, LEN 2512 add $16, INP 2513 add $16, OUTP 2514 cmp $16, LEN 2515 jge .Lcbc_enc_loop 2516 movups STATE, (IVP) 2517.Lcbc_enc_ret: 2518#ifndef __x86_64__ 2519 popl KLEN 2520 popl KEYP 2521 popl LEN 2522 popl IVP 2523#endif 2524 FRAME_END 2525 ret 2526ENDPROC(aesni_cbc_enc) 2527 2528/* 2529 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2530 * size_t len, u8 *iv) 2531 */ 2532ENTRY(aesni_cbc_dec) 2533 FRAME_BEGIN 2534#ifndef __x86_64__ 2535 pushl IVP 2536 pushl LEN 2537 pushl KEYP 2538 pushl KLEN 2539 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2540 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2541 movl (FRAME_OFFSET+28)(%esp), INP # src 2542 movl (FRAME_OFFSET+32)(%esp), LEN # len 2543 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2544#endif 2545 cmp $16, LEN 2546 jb .Lcbc_dec_just_ret 2547 mov 480(KEYP), KLEN 2548 add $240, KEYP 2549 movups (IVP), IV 2550 cmp $64, LEN 2551 jb .Lcbc_dec_loop1 2552.align 4 2553.Lcbc_dec_loop4: 2554 movups (INP), IN1 2555 movaps IN1, STATE1 2556 movups 0x10(INP), IN2 2557 movaps IN2, STATE2 2558#ifdef __x86_64__ 2559 movups 0x20(INP), IN3 2560 movaps IN3, STATE3 2561 movups 0x30(INP), IN4 2562 movaps IN4, STATE4 2563#else 2564 movups 0x20(INP), IN1 2565 movaps IN1, STATE3 2566 movups 0x30(INP), IN2 2567 movaps IN2, STATE4 2568#endif 2569 call _aesni_dec4 2570 pxor IV, STATE1 2571#ifdef __x86_64__ 2572 pxor IN1, STATE2 2573 pxor IN2, STATE3 2574 pxor IN3, STATE4 2575 movaps IN4, IV 2576#else 2577 pxor IN1, STATE4 2578 movaps IN2, IV 2579 movups (INP), IN1 2580 pxor IN1, STATE2 2581 movups 0x10(INP), IN2 2582 pxor IN2, STATE3 2583#endif 2584 movups STATE1, (OUTP) 2585 movups STATE2, 0x10(OUTP) 2586 movups STATE3, 0x20(OUTP) 2587 movups STATE4, 0x30(OUTP) 2588 sub $64, LEN 2589 add $64, INP 2590 add $64, OUTP 2591 cmp $64, LEN 2592 jge .Lcbc_dec_loop4 2593 cmp $16, LEN 2594 jb .Lcbc_dec_ret 2595.align 4 2596.Lcbc_dec_loop1: 2597 movups (INP), IN 2598 movaps IN, STATE 2599 call _aesni_dec1 2600 pxor IV, STATE 2601 movups STATE, (OUTP) 2602 movaps IN, IV 2603 sub $16, LEN 2604 add $16, INP 2605 add $16, OUTP 2606 cmp $16, LEN 2607 jge .Lcbc_dec_loop1 2608.Lcbc_dec_ret: 2609 movups IV, (IVP) 2610.Lcbc_dec_just_ret: 2611#ifndef __x86_64__ 2612 popl KLEN 2613 popl KEYP 2614 popl LEN 2615 popl IVP 2616#endif 2617 FRAME_END 2618 ret 2619ENDPROC(aesni_cbc_dec) 2620 2621#ifdef __x86_64__ 2622.pushsection .rodata 2623.align 16 2624.Lbswap_mask: 2625 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2626.popsection 2627 2628/* 2629 * _aesni_inc_init: internal ABI 2630 * setup registers used by _aesni_inc 2631 * input: 2632 * IV 2633 * output: 2634 * CTR: == IV, in little endian 2635 * TCTR_LOW: == lower qword of CTR 2636 * INC: == 1, in little endian 2637 * BSWAP_MASK == endian swapping mask 2638 */ 2639.align 4 2640_aesni_inc_init: 2641 movaps .Lbswap_mask, BSWAP_MASK 2642 movaps IV, CTR 2643 PSHUFB_XMM BSWAP_MASK CTR 2644 mov $1, TCTR_LOW 2645 MOVQ_R64_XMM TCTR_LOW INC 2646 MOVQ_R64_XMM CTR TCTR_LOW 2647 ret 2648ENDPROC(_aesni_inc_init) 2649 2650/* 2651 * _aesni_inc: internal ABI 2652 * Increase IV by 1, IV is in big endian 2653 * input: 2654 * IV 2655 * CTR: == IV, in little endian 2656 * TCTR_LOW: == lower qword of CTR 2657 * INC: == 1, in little endian 2658 * BSWAP_MASK == endian swapping mask 2659 * output: 2660 * IV: Increase by 1 2661 * changed: 2662 * CTR: == output IV, in little endian 2663 * TCTR_LOW: == lower qword of CTR 2664 */ 2665.align 4 2666_aesni_inc: 2667 paddq INC, CTR 2668 add $1, TCTR_LOW 2669 jnc .Linc_low 2670 pslldq $8, INC 2671 paddq INC, CTR 2672 psrldq $8, INC 2673.Linc_low: 2674 movaps CTR, IV 2675 PSHUFB_XMM BSWAP_MASK IV 2676 ret 2677ENDPROC(_aesni_inc) 2678 2679/* 2680 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2681 * size_t len, u8 *iv) 2682 */ 2683ENTRY(aesni_ctr_enc) 2684 FRAME_BEGIN 2685 cmp $16, LEN 2686 jb .Lctr_enc_just_ret 2687 mov 480(KEYP), KLEN 2688 movups (IVP), IV 2689 call _aesni_inc_init 2690 cmp $64, LEN 2691 jb .Lctr_enc_loop1 2692.align 4 2693.Lctr_enc_loop4: 2694 movaps IV, STATE1 2695 call _aesni_inc 2696 movups (INP), IN1 2697 movaps IV, STATE2 2698 call _aesni_inc 2699 movups 0x10(INP), IN2 2700 movaps IV, STATE3 2701 call _aesni_inc 2702 movups 0x20(INP), IN3 2703 movaps IV, STATE4 2704 call _aesni_inc 2705 movups 0x30(INP), IN4 2706 call _aesni_enc4 2707 pxor IN1, STATE1 2708 movups STATE1, (OUTP) 2709 pxor IN2, STATE2 2710 movups STATE2, 0x10(OUTP) 2711 pxor IN3, STATE3 2712 movups STATE3, 0x20(OUTP) 2713 pxor IN4, STATE4 2714 movups STATE4, 0x30(OUTP) 2715 sub $64, LEN 2716 add $64, INP 2717 add $64, OUTP 2718 cmp $64, LEN 2719 jge .Lctr_enc_loop4 2720 cmp $16, LEN 2721 jb .Lctr_enc_ret 2722.align 4 2723.Lctr_enc_loop1: 2724 movaps IV, STATE 2725 call _aesni_inc 2726 movups (INP), IN 2727 call _aesni_enc1 2728 pxor IN, STATE 2729 movups STATE, (OUTP) 2730 sub $16, LEN 2731 add $16, INP 2732 add $16, OUTP 2733 cmp $16, LEN 2734 jge .Lctr_enc_loop1 2735.Lctr_enc_ret: 2736 movups IV, (IVP) 2737.Lctr_enc_just_ret: 2738 FRAME_END 2739 ret 2740ENDPROC(aesni_ctr_enc) 2741 2742/* 2743 * _aesni_gf128mul_x_ble: internal ABI 2744 * Multiply in GF(2^128) for XTS IVs 2745 * input: 2746 * IV: current IV 2747 * GF128MUL_MASK == mask with 0x87 and 0x01 2748 * output: 2749 * IV: next IV 2750 * changed: 2751 * CTR: == temporary value 2752 */ 2753#define _aesni_gf128mul_x_ble() \ 2754 pshufd $0x13, IV, CTR; \ 2755 paddq IV, IV; \ 2756 psrad $31, CTR; \ 2757 pand GF128MUL_MASK, CTR; \ 2758 pxor CTR, IV; 2759 2760/* 2761 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2762 * bool enc, u8 *iv) 2763 */ 2764ENTRY(aesni_xts_crypt8) 2765 FRAME_BEGIN 2766 cmpb $0, %cl 2767 movl $0, %ecx 2768 movl $240, %r10d 2769 leaq _aesni_enc4, %r11 2770 leaq _aesni_dec4, %rax 2771 cmovel %r10d, %ecx 2772 cmoveq %rax, %r11 2773 2774 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK 2775 movups (IVP), IV 2776 2777 mov 480(KEYP), KLEN 2778 addq %rcx, KEYP 2779 2780 movdqa IV, STATE1 2781 movdqu 0x00(INP), INC 2782 pxor INC, STATE1 2783 movdqu IV, 0x00(OUTP) 2784 2785 _aesni_gf128mul_x_ble() 2786 movdqa IV, STATE2 2787 movdqu 0x10(INP), INC 2788 pxor INC, STATE2 2789 movdqu IV, 0x10(OUTP) 2790 2791 _aesni_gf128mul_x_ble() 2792 movdqa IV, STATE3 2793 movdqu 0x20(INP), INC 2794 pxor INC, STATE3 2795 movdqu IV, 0x20(OUTP) 2796 2797 _aesni_gf128mul_x_ble() 2798 movdqa IV, STATE4 2799 movdqu 0x30(INP), INC 2800 pxor INC, STATE4 2801 movdqu IV, 0x30(OUTP) 2802 2803 CALL_NOSPEC %r11 2804 2805 movdqu 0x00(OUTP), INC 2806 pxor INC, STATE1 2807 movdqu STATE1, 0x00(OUTP) 2808 2809 _aesni_gf128mul_x_ble() 2810 movdqa IV, STATE1 2811 movdqu 0x40(INP), INC 2812 pxor INC, STATE1 2813 movdqu IV, 0x40(OUTP) 2814 2815 movdqu 0x10(OUTP), INC 2816 pxor INC, STATE2 2817 movdqu STATE2, 0x10(OUTP) 2818 2819 _aesni_gf128mul_x_ble() 2820 movdqa IV, STATE2 2821 movdqu 0x50(INP), INC 2822 pxor INC, STATE2 2823 movdqu IV, 0x50(OUTP) 2824 2825 movdqu 0x20(OUTP), INC 2826 pxor INC, STATE3 2827 movdqu STATE3, 0x20(OUTP) 2828 2829 _aesni_gf128mul_x_ble() 2830 movdqa IV, STATE3 2831 movdqu 0x60(INP), INC 2832 pxor INC, STATE3 2833 movdqu IV, 0x60(OUTP) 2834 2835 movdqu 0x30(OUTP), INC 2836 pxor INC, STATE4 2837 movdqu STATE4, 0x30(OUTP) 2838 2839 _aesni_gf128mul_x_ble() 2840 movdqa IV, STATE4 2841 movdqu 0x70(INP), INC 2842 pxor INC, STATE4 2843 movdqu IV, 0x70(OUTP) 2844 2845 _aesni_gf128mul_x_ble() 2846 movups IV, (IVP) 2847 2848 CALL_NOSPEC %r11 2849 2850 movdqu 0x40(OUTP), INC 2851 pxor INC, STATE1 2852 movdqu STATE1, 0x40(OUTP) 2853 2854 movdqu 0x50(OUTP), INC 2855 pxor INC, STATE2 2856 movdqu STATE2, 0x50(OUTP) 2857 2858 movdqu 0x60(OUTP), INC 2859 pxor INC, STATE3 2860 movdqu STATE3, 0x60(OUTP) 2861 2862 movdqu 0x70(OUTP), INC 2863 pxor INC, STATE4 2864 movdqu STATE4, 0x70(OUTP) 2865 2866 FRAME_END 2867 ret 2868ENDPROC(aesni_xts_crypt8) 2869 2870#endif 2871