1/* 2 * Implement AES algorithm in Intel AES-NI instructions. 3 * 4 * The white paper of AES-NI instructions can be downloaded from: 5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf 6 * 7 * Copyright (C) 2008, Intel Corp. 8 * Author: Huang Ying <ying.huang@intel.com> 9 * Vinodh Gopal <vinodh.gopal@intel.com> 10 * Kahraman Akdemir 11 * 12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD 13 * interface for 64-bit kernels. 14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) 15 * Aidan O'Mahony (aidan.o.mahony@intel.com) 16 * Adrian Hoban <adrian.hoban@intel.com> 17 * James Guilford (james.guilford@intel.com) 18 * Gabriele Paoloni <gabriele.paoloni@intel.com> 19 * Tadeusz Struk (tadeusz.struk@intel.com) 20 * Wajdi Feghali (wajdi.k.feghali@intel.com) 21 * Copyright (c) 2010, Intel Corporation. 22 * 23 * Ported x86_64 version to x86: 24 * Author: Mathias Krause <minipli@googlemail.com> 25 * 26 * This program is free software; you can redistribute it and/or modify 27 * it under the terms of the GNU General Public License as published by 28 * the Free Software Foundation; either version 2 of the License, or 29 * (at your option) any later version. 30 */ 31 32#include <linux/linkage.h> 33#include <asm/inst.h> 34#include <asm/frame.h> 35 36/* 37 * The following macros are used to move an (un)aligned 16 byte value to/from 38 * an XMM register. This can done for either FP or integer values, for FP use 39 * movaps (move aligned packed single) or integer use movdqa (move double quad 40 * aligned). It doesn't make a performance difference which instruction is used 41 * since Nehalem (original Core i7) was released. However, the movaps is a byte 42 * shorter, so that is the one we'll use for now. (same for unaligned). 43 */ 44#define MOVADQ movaps 45#define MOVUDQ movups 46 47#ifdef __x86_64__ 48 49# constants in mergeable sections, linker can reorder and merge 50.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 51.align 16 52.Lgf128mul_x_ble_mask: 53 .octa 0x00000000000000010000000000000087 54.section .rodata.cst16.POLY, "aM", @progbits, 16 55.align 16 56POLY: .octa 0xC2000000000000000000000000000001 57.section .rodata.cst16.TWOONE, "aM", @progbits, 16 58.align 16 59TWOONE: .octa 0x00000001000000000000000000000001 60 61.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 62.align 16 63SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F 64.section .rodata.cst16.MASK1, "aM", @progbits, 16 65.align 16 66MASK1: .octa 0x0000000000000000ffffffffffffffff 67.section .rodata.cst16.MASK2, "aM", @progbits, 16 68.align 16 69MASK2: .octa 0xffffffffffffffff0000000000000000 70.section .rodata.cst16.ONE, "aM", @progbits, 16 71.align 16 72ONE: .octa 0x00000000000000000000000000000001 73.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 74.align 16 75F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 76.section .rodata.cst16.dec, "aM", @progbits, 16 77.align 16 78dec: .octa 0x1 79.section .rodata.cst16.enc, "aM", @progbits, 16 80.align 16 81enc: .octa 0x2 82 83# order of these constants should not change. 84# more specifically, ALL_F should follow SHIFT_MASK, 85# and zero should follow ALL_F 86.section .rodata, "a", @progbits 87.align 16 88SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 89ALL_F: .octa 0xffffffffffffffffffffffffffffffff 90 .octa 0x00000000000000000000000000000000 91 92.section .rodata 93.align 16 94.type aad_shift_arr, @object 95.size aad_shift_arr, 272 96aad_shift_arr: 97 .octa 0xffffffffffffffffffffffffffffffff 98 .octa 0xffffffffffffffffffffffffffffff0C 99 .octa 0xffffffffffffffffffffffffffff0D0C 100 .octa 0xffffffffffffffffffffffffff0E0D0C 101 .octa 0xffffffffffffffffffffffff0F0E0D0C 102 .octa 0xffffffffffffffffffffff0C0B0A0908 103 .octa 0xffffffffffffffffffff0D0C0B0A0908 104 .octa 0xffffffffffffffffff0E0D0C0B0A0908 105 .octa 0xffffffffffffffff0F0E0D0C0B0A0908 106 .octa 0xffffffffffffff0C0B0A090807060504 107 .octa 0xffffffffffff0D0C0B0A090807060504 108 .octa 0xffffffffff0E0D0C0B0A090807060504 109 .octa 0xffffffff0F0E0D0C0B0A090807060504 110 .octa 0xffffff0C0B0A09080706050403020100 111 .octa 0xffff0D0C0B0A09080706050403020100 112 .octa 0xff0E0D0C0B0A09080706050403020100 113 .octa 0x0F0E0D0C0B0A09080706050403020100 114 115 116.text 117 118 119#define STACK_OFFSET 8*3 120#define HashKey 16*0 // store HashKey <<1 mod poly here 121#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here 122#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here 123#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here 124#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64 125 // bits of HashKey <<1 mod poly here 126 //(for Karatsuba purposes) 127#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64 128 // bits of HashKey^2 <<1 mod poly here 129 // (for Karatsuba purposes) 130#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64 131 // bits of HashKey^3 <<1 mod poly here 132 // (for Karatsuba purposes) 133#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64 134 // bits of HashKey^4 <<1 mod poly here 135 // (for Karatsuba purposes) 136#define VARIABLE_OFFSET 16*8 137 138#define arg1 rdi 139#define arg2 rsi 140#define arg3 rdx 141#define arg4 rcx 142#define arg5 r8 143#define arg6 r9 144#define arg7 STACK_OFFSET+8(%r14) 145#define arg8 STACK_OFFSET+16(%r14) 146#define arg9 STACK_OFFSET+24(%r14) 147#define arg10 STACK_OFFSET+32(%r14) 148#define keysize 2*15*16(%arg1) 149#endif 150 151 152#define STATE1 %xmm0 153#define STATE2 %xmm4 154#define STATE3 %xmm5 155#define STATE4 %xmm6 156#define STATE STATE1 157#define IN1 %xmm1 158#define IN2 %xmm7 159#define IN3 %xmm8 160#define IN4 %xmm9 161#define IN IN1 162#define KEY %xmm2 163#define IV %xmm3 164 165#define BSWAP_MASK %xmm10 166#define CTR %xmm11 167#define INC %xmm12 168 169#define GF128MUL_MASK %xmm10 170 171#ifdef __x86_64__ 172#define AREG %rax 173#define KEYP %rdi 174#define OUTP %rsi 175#define UKEYP OUTP 176#define INP %rdx 177#define LEN %rcx 178#define IVP %r8 179#define KLEN %r9d 180#define T1 %r10 181#define TKEYP T1 182#define T2 %r11 183#define TCTR_LOW T2 184#else 185#define AREG %eax 186#define KEYP %edi 187#define OUTP AREG 188#define UKEYP OUTP 189#define INP %edx 190#define LEN %esi 191#define IVP %ebp 192#define KLEN %ebx 193#define T1 %ecx 194#define TKEYP T1 195#endif 196 197 198#ifdef __x86_64__ 199/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 200* 201* 202* Input: A and B (128-bits each, bit-reflected) 203* Output: C = A*B*x mod poly, (i.e. >>1 ) 204* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 205* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 206* 207*/ 208.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 209 movdqa \GH, \TMP1 210 pshufd $78, \GH, \TMP2 211 pshufd $78, \HK, \TMP3 212 pxor \GH, \TMP2 # TMP2 = a1+a0 213 pxor \HK, \TMP3 # TMP3 = b1+b0 214 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 215 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 216 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) 217 pxor \GH, \TMP2 218 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) 219 movdqa \TMP2, \TMP3 220 pslldq $8, \TMP3 # left shift TMP3 2 DWs 221 psrldq $8, \TMP2 # right shift TMP2 2 DWs 222 pxor \TMP3, \GH 223 pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK 224 225 # first phase of the reduction 226 227 movdqa \GH, \TMP2 228 movdqa \GH, \TMP3 229 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 230 # in in order to perform 231 # independent shifts 232 pslld $31, \TMP2 # packed right shift <<31 233 pslld $30, \TMP3 # packed right shift <<30 234 pslld $25, \TMP4 # packed right shift <<25 235 pxor \TMP3, \TMP2 # xor the shifted versions 236 pxor \TMP4, \TMP2 237 movdqa \TMP2, \TMP5 238 psrldq $4, \TMP5 # right shift TMP5 1 DW 239 pslldq $12, \TMP2 # left shift TMP2 3 DWs 240 pxor \TMP2, \GH 241 242 # second phase of the reduction 243 244 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 245 # in in order to perform 246 # independent shifts 247 movdqa \GH,\TMP3 248 movdqa \GH,\TMP4 249 psrld $1,\TMP2 # packed left shift >>1 250 psrld $2,\TMP3 # packed left shift >>2 251 psrld $7,\TMP4 # packed left shift >>7 252 pxor \TMP3,\TMP2 # xor the shifted versions 253 pxor \TMP4,\TMP2 254 pxor \TMP5, \TMP2 255 pxor \TMP2, \GH 256 pxor \TMP1, \GH # result is in TMP1 257.endm 258 259/* 260* if a = number of total plaintext bytes 261* b = floor(a/16) 262* num_initial_blocks = b mod 4 263* encrypt the initial num_initial_blocks blocks and apply ghash on 264* the ciphertext 265* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 266* are clobbered 267* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified 268*/ 269 270 271.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 272XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 273 MOVADQ SHUF_MASK(%rip), %xmm14 274 mov arg7, %r10 # %r10 = AAD 275 mov arg8, %r12 # %r12 = aadLen 276 mov %r12, %r11 277 pxor %xmm\i, %xmm\i 278 pxor \XMM2, \XMM2 279 280 cmp $16, %r11 281 jl _get_AAD_rest8\num_initial_blocks\operation 282_get_AAD_blocks\num_initial_blocks\operation: 283 movdqu (%r10), %xmm\i 284 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 285 pxor %xmm\i, \XMM2 286 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 287 add $16, %r10 288 sub $16, %r12 289 sub $16, %r11 290 cmp $16, %r11 291 jge _get_AAD_blocks\num_initial_blocks\operation 292 293 movdqu \XMM2, %xmm\i 294 cmp $0, %r11 295 je _get_AAD_done\num_initial_blocks\operation 296 297 pxor %xmm\i,%xmm\i 298 299 /* read the last <16B of AAD. since we have at least 4B of 300 data right after the AAD (the ICV, and maybe some CT), we can 301 read 4B/8B blocks safely, and then get rid of the extra stuff */ 302_get_AAD_rest8\num_initial_blocks\operation: 303 cmp $4, %r11 304 jle _get_AAD_rest4\num_initial_blocks\operation 305 movq (%r10), \TMP1 306 add $8, %r10 307 sub $8, %r11 308 pslldq $8, \TMP1 309 psrldq $8, %xmm\i 310 pxor \TMP1, %xmm\i 311 jmp _get_AAD_rest8\num_initial_blocks\operation 312_get_AAD_rest4\num_initial_blocks\operation: 313 cmp $0, %r11 314 jle _get_AAD_rest0\num_initial_blocks\operation 315 mov (%r10), %eax 316 movq %rax, \TMP1 317 add $4, %r10 318 sub $4, %r10 319 pslldq $12, \TMP1 320 psrldq $4, %xmm\i 321 pxor \TMP1, %xmm\i 322_get_AAD_rest0\num_initial_blocks\operation: 323 /* finalize: shift out the extra bytes we read, and align 324 left. since pslldq can only shift by an immediate, we use 325 vpshufb and an array of shuffle masks */ 326 movq %r12, %r11 327 salq $4, %r11 328 movdqu aad_shift_arr(%r11), \TMP1 329 PSHUFB_XMM \TMP1, %xmm\i 330_get_AAD_rest_final\num_initial_blocks\operation: 331 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 332 pxor \XMM2, %xmm\i 333 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 334 335_get_AAD_done\num_initial_blocks\operation: 336 xor %r11, %r11 # initialise the data pointer offset as zero 337 # start AES for num_initial_blocks blocks 338 339 mov %arg5, %rax # %rax = *Y0 340 movdqu (%rax), \XMM0 # XMM0 = Y0 341 PSHUFB_XMM %xmm14, \XMM0 342 343.if (\i == 5) || (\i == 6) || (\i == 7) 344 MOVADQ ONE(%RIP),\TMP1 345 MOVADQ (%arg1),\TMP2 346.irpc index, \i_seq 347 paddd \TMP1, \XMM0 # INCR Y0 348 movdqa \XMM0, %xmm\index 349 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 350 pxor \TMP2, %xmm\index 351.endr 352 lea 0x10(%arg1),%r10 353 mov keysize,%eax 354 shr $2,%eax # 128->4, 192->6, 256->8 355 add $5,%eax # 128->9, 192->11, 256->13 356 357aes_loop_initial_dec\num_initial_blocks: 358 MOVADQ (%r10),\TMP1 359.irpc index, \i_seq 360 AESENC \TMP1, %xmm\index 361.endr 362 add $16,%r10 363 sub $1,%eax 364 jnz aes_loop_initial_dec\num_initial_blocks 365 366 MOVADQ (%r10), \TMP1 367.irpc index, \i_seq 368 AESENCLAST \TMP1, %xmm\index # Last Round 369.endr 370.irpc index, \i_seq 371 movdqu (%arg3 , %r11, 1), \TMP1 372 pxor \TMP1, %xmm\index 373 movdqu %xmm\index, (%arg2 , %r11, 1) 374 # write back plaintext/ciphertext for num_initial_blocks 375 add $16, %r11 376 377 movdqa \TMP1, %xmm\index 378 PSHUFB_XMM %xmm14, %xmm\index 379 # prepare plaintext/ciphertext for GHASH computation 380.endr 381.endif 382 383 # apply GHASH on num_initial_blocks blocks 384 385.if \i == 5 386 pxor %xmm5, %xmm6 387 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 388 pxor %xmm6, %xmm7 389 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 390 pxor %xmm7, %xmm8 391 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 392.elseif \i == 6 393 pxor %xmm6, %xmm7 394 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 395 pxor %xmm7, %xmm8 396 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 397.elseif \i == 7 398 pxor %xmm7, %xmm8 399 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 400.endif 401 cmp $64, %r13 402 jl _initial_blocks_done\num_initial_blocks\operation 403 # no need for precomputed values 404/* 405* 406* Precomputations for HashKey parallel with encryption of first 4 blocks. 407* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 408*/ 409 MOVADQ ONE(%rip), \TMP1 410 paddd \TMP1, \XMM0 # INCR Y0 411 MOVADQ \XMM0, \XMM1 412 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 413 414 paddd \TMP1, \XMM0 # INCR Y0 415 MOVADQ \XMM0, \XMM2 416 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 417 418 paddd \TMP1, \XMM0 # INCR Y0 419 MOVADQ \XMM0, \XMM3 420 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 421 422 paddd \TMP1, \XMM0 # INCR Y0 423 MOVADQ \XMM0, \XMM4 424 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 425 426 MOVADQ 0(%arg1),\TMP1 427 pxor \TMP1, \XMM1 428 pxor \TMP1, \XMM2 429 pxor \TMP1, \XMM3 430 pxor \TMP1, \XMM4 431 movdqa \TMP3, \TMP5 432 pshufd $78, \TMP3, \TMP1 433 pxor \TMP3, \TMP1 434 movdqa \TMP1, HashKey_k(%rsp) 435 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 436# TMP5 = HashKey^2<<1 (mod poly) 437 movdqa \TMP5, HashKey_2(%rsp) 438# HashKey_2 = HashKey^2<<1 (mod poly) 439 pshufd $78, \TMP5, \TMP1 440 pxor \TMP5, \TMP1 441 movdqa \TMP1, HashKey_2_k(%rsp) 442.irpc index, 1234 # do 4 rounds 443 movaps 0x10*\index(%arg1), \TMP1 444 AESENC \TMP1, \XMM1 445 AESENC \TMP1, \XMM2 446 AESENC \TMP1, \XMM3 447 AESENC \TMP1, \XMM4 448.endr 449 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 450# TMP5 = HashKey^3<<1 (mod poly) 451 movdqa \TMP5, HashKey_3(%rsp) 452 pshufd $78, \TMP5, \TMP1 453 pxor \TMP5, \TMP1 454 movdqa \TMP1, HashKey_3_k(%rsp) 455.irpc index, 56789 # do next 5 rounds 456 movaps 0x10*\index(%arg1), \TMP1 457 AESENC \TMP1, \XMM1 458 AESENC \TMP1, \XMM2 459 AESENC \TMP1, \XMM3 460 AESENC \TMP1, \XMM4 461.endr 462 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 463# TMP5 = HashKey^3<<1 (mod poly) 464 movdqa \TMP5, HashKey_4(%rsp) 465 pshufd $78, \TMP5, \TMP1 466 pxor \TMP5, \TMP1 467 movdqa \TMP1, HashKey_4_k(%rsp) 468 lea 0xa0(%arg1),%r10 469 mov keysize,%eax 470 shr $2,%eax # 128->4, 192->6, 256->8 471 sub $4,%eax # 128->0, 192->2, 256->4 472 jz aes_loop_pre_dec_done\num_initial_blocks 473 474aes_loop_pre_dec\num_initial_blocks: 475 MOVADQ (%r10),\TMP2 476.irpc index, 1234 477 AESENC \TMP2, %xmm\index 478.endr 479 add $16,%r10 480 sub $1,%eax 481 jnz aes_loop_pre_dec\num_initial_blocks 482 483aes_loop_pre_dec_done\num_initial_blocks: 484 MOVADQ (%r10), \TMP2 485 AESENCLAST \TMP2, \XMM1 486 AESENCLAST \TMP2, \XMM2 487 AESENCLAST \TMP2, \XMM3 488 AESENCLAST \TMP2, \XMM4 489 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 490 pxor \TMP1, \XMM1 491 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 492 movdqa \TMP1, \XMM1 493 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 494 pxor \TMP1, \XMM2 495 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 496 movdqa \TMP1, \XMM2 497 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 498 pxor \TMP1, \XMM3 499 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 500 movdqa \TMP1, \XMM3 501 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 502 pxor \TMP1, \XMM4 503 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 504 movdqa \TMP1, \XMM4 505 add $64, %r11 506 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 507 pxor \XMMDst, \XMM1 508# combine GHASHed value with the corresponding ciphertext 509 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 510 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 511 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 512 513_initial_blocks_done\num_initial_blocks\operation: 514 515.endm 516 517 518/* 519* if a = number of total plaintext bytes 520* b = floor(a/16) 521* num_initial_blocks = b mod 4 522* encrypt the initial num_initial_blocks blocks and apply ghash on 523* the ciphertext 524* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 525* are clobbered 526* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified 527*/ 528 529 530.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 531XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 532 MOVADQ SHUF_MASK(%rip), %xmm14 533 mov arg7, %r10 # %r10 = AAD 534 mov arg8, %r12 # %r12 = aadLen 535 mov %r12, %r11 536 pxor %xmm\i, %xmm\i 537 pxor \XMM2, \XMM2 538 539 cmp $16, %r11 540 jl _get_AAD_rest8\num_initial_blocks\operation 541_get_AAD_blocks\num_initial_blocks\operation: 542 movdqu (%r10), %xmm\i 543 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 544 pxor %xmm\i, \XMM2 545 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 546 add $16, %r10 547 sub $16, %r12 548 sub $16, %r11 549 cmp $16, %r11 550 jge _get_AAD_blocks\num_initial_blocks\operation 551 552 movdqu \XMM2, %xmm\i 553 cmp $0, %r11 554 je _get_AAD_done\num_initial_blocks\operation 555 556 pxor %xmm\i,%xmm\i 557 558 /* read the last <16B of AAD. since we have at least 4B of 559 data right after the AAD (the ICV, and maybe some PT), we can 560 read 4B/8B blocks safely, and then get rid of the extra stuff */ 561_get_AAD_rest8\num_initial_blocks\operation: 562 cmp $4, %r11 563 jle _get_AAD_rest4\num_initial_blocks\operation 564 movq (%r10), \TMP1 565 add $8, %r10 566 sub $8, %r11 567 pslldq $8, \TMP1 568 psrldq $8, %xmm\i 569 pxor \TMP1, %xmm\i 570 jmp _get_AAD_rest8\num_initial_blocks\operation 571_get_AAD_rest4\num_initial_blocks\operation: 572 cmp $0, %r11 573 jle _get_AAD_rest0\num_initial_blocks\operation 574 mov (%r10), %eax 575 movq %rax, \TMP1 576 add $4, %r10 577 sub $4, %r10 578 pslldq $12, \TMP1 579 psrldq $4, %xmm\i 580 pxor \TMP1, %xmm\i 581_get_AAD_rest0\num_initial_blocks\operation: 582 /* finalize: shift out the extra bytes we read, and align 583 left. since pslldq can only shift by an immediate, we use 584 vpshufb and an array of shuffle masks */ 585 movq %r12, %r11 586 salq $4, %r11 587 movdqu aad_shift_arr(%r11), \TMP1 588 PSHUFB_XMM \TMP1, %xmm\i 589_get_AAD_rest_final\num_initial_blocks\operation: 590 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 591 pxor \XMM2, %xmm\i 592 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 593 594_get_AAD_done\num_initial_blocks\operation: 595 xor %r11, %r11 # initialise the data pointer offset as zero 596 # start AES for num_initial_blocks blocks 597 598 mov %arg5, %rax # %rax = *Y0 599 movdqu (%rax), \XMM0 # XMM0 = Y0 600 PSHUFB_XMM %xmm14, \XMM0 601 602.if (\i == 5) || (\i == 6) || (\i == 7) 603 604 MOVADQ ONE(%RIP),\TMP1 605 MOVADQ 0(%arg1),\TMP2 606.irpc index, \i_seq 607 paddd \TMP1, \XMM0 # INCR Y0 608 MOVADQ \XMM0, %xmm\index 609 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 610 pxor \TMP2, %xmm\index 611.endr 612 lea 0x10(%arg1),%r10 613 mov keysize,%eax 614 shr $2,%eax # 128->4, 192->6, 256->8 615 add $5,%eax # 128->9, 192->11, 256->13 616 617aes_loop_initial_enc\num_initial_blocks: 618 MOVADQ (%r10),\TMP1 619.irpc index, \i_seq 620 AESENC \TMP1, %xmm\index 621.endr 622 add $16,%r10 623 sub $1,%eax 624 jnz aes_loop_initial_enc\num_initial_blocks 625 626 MOVADQ (%r10), \TMP1 627.irpc index, \i_seq 628 AESENCLAST \TMP1, %xmm\index # Last Round 629.endr 630.irpc index, \i_seq 631 movdqu (%arg3 , %r11, 1), \TMP1 632 pxor \TMP1, %xmm\index 633 movdqu %xmm\index, (%arg2 , %r11, 1) 634 # write back plaintext/ciphertext for num_initial_blocks 635 add $16, %r11 636 PSHUFB_XMM %xmm14, %xmm\index 637 638 # prepare plaintext/ciphertext for GHASH computation 639.endr 640.endif 641 642 # apply GHASH on num_initial_blocks blocks 643 644.if \i == 5 645 pxor %xmm5, %xmm6 646 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 647 pxor %xmm6, %xmm7 648 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 649 pxor %xmm7, %xmm8 650 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 651.elseif \i == 6 652 pxor %xmm6, %xmm7 653 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 654 pxor %xmm7, %xmm8 655 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 656.elseif \i == 7 657 pxor %xmm7, %xmm8 658 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 659.endif 660 cmp $64, %r13 661 jl _initial_blocks_done\num_initial_blocks\operation 662 # no need for precomputed values 663/* 664* 665* Precomputations for HashKey parallel with encryption of first 4 blocks. 666* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 667*/ 668 MOVADQ ONE(%RIP),\TMP1 669 paddd \TMP1, \XMM0 # INCR Y0 670 MOVADQ \XMM0, \XMM1 671 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 672 673 paddd \TMP1, \XMM0 # INCR Y0 674 MOVADQ \XMM0, \XMM2 675 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 676 677 paddd \TMP1, \XMM0 # INCR Y0 678 MOVADQ \XMM0, \XMM3 679 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 680 681 paddd \TMP1, \XMM0 # INCR Y0 682 MOVADQ \XMM0, \XMM4 683 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 684 685 MOVADQ 0(%arg1),\TMP1 686 pxor \TMP1, \XMM1 687 pxor \TMP1, \XMM2 688 pxor \TMP1, \XMM3 689 pxor \TMP1, \XMM4 690 movdqa \TMP3, \TMP5 691 pshufd $78, \TMP3, \TMP1 692 pxor \TMP3, \TMP1 693 movdqa \TMP1, HashKey_k(%rsp) 694 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 695# TMP5 = HashKey^2<<1 (mod poly) 696 movdqa \TMP5, HashKey_2(%rsp) 697# HashKey_2 = HashKey^2<<1 (mod poly) 698 pshufd $78, \TMP5, \TMP1 699 pxor \TMP5, \TMP1 700 movdqa \TMP1, HashKey_2_k(%rsp) 701.irpc index, 1234 # do 4 rounds 702 movaps 0x10*\index(%arg1), \TMP1 703 AESENC \TMP1, \XMM1 704 AESENC \TMP1, \XMM2 705 AESENC \TMP1, \XMM3 706 AESENC \TMP1, \XMM4 707.endr 708 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 709# TMP5 = HashKey^3<<1 (mod poly) 710 movdqa \TMP5, HashKey_3(%rsp) 711 pshufd $78, \TMP5, \TMP1 712 pxor \TMP5, \TMP1 713 movdqa \TMP1, HashKey_3_k(%rsp) 714.irpc index, 56789 # do next 5 rounds 715 movaps 0x10*\index(%arg1), \TMP1 716 AESENC \TMP1, \XMM1 717 AESENC \TMP1, \XMM2 718 AESENC \TMP1, \XMM3 719 AESENC \TMP1, \XMM4 720.endr 721 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 722# TMP5 = HashKey^3<<1 (mod poly) 723 movdqa \TMP5, HashKey_4(%rsp) 724 pshufd $78, \TMP5, \TMP1 725 pxor \TMP5, \TMP1 726 movdqa \TMP1, HashKey_4_k(%rsp) 727 lea 0xa0(%arg1),%r10 728 mov keysize,%eax 729 shr $2,%eax # 128->4, 192->6, 256->8 730 sub $4,%eax # 128->0, 192->2, 256->4 731 jz aes_loop_pre_enc_done\num_initial_blocks 732 733aes_loop_pre_enc\num_initial_blocks: 734 MOVADQ (%r10),\TMP2 735.irpc index, 1234 736 AESENC \TMP2, %xmm\index 737.endr 738 add $16,%r10 739 sub $1,%eax 740 jnz aes_loop_pre_enc\num_initial_blocks 741 742aes_loop_pre_enc_done\num_initial_blocks: 743 MOVADQ (%r10), \TMP2 744 AESENCLAST \TMP2, \XMM1 745 AESENCLAST \TMP2, \XMM2 746 AESENCLAST \TMP2, \XMM3 747 AESENCLAST \TMP2, \XMM4 748 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 749 pxor \TMP1, \XMM1 750 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 751 pxor \TMP1, \XMM2 752 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 753 pxor \TMP1, \XMM3 754 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 755 pxor \TMP1, \XMM4 756 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 757 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 758 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 759 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 760 761 add $64, %r11 762 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 763 pxor \XMMDst, \XMM1 764# combine GHASHed value with the corresponding ciphertext 765 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 766 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 767 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 768 769_initial_blocks_done\num_initial_blocks\operation: 770 771.endm 772 773/* 774* encrypt 4 blocks at a time 775* ghash the 4 previously encrypted ciphertext blocks 776* arg1, %arg2, %arg3 are used as pointers only, not modified 777* %r11 is the data offset value 778*/ 779.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ 780TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 781 782 movdqa \XMM1, \XMM5 783 movdqa \XMM2, \XMM6 784 movdqa \XMM3, \XMM7 785 movdqa \XMM4, \XMM8 786 787 movdqa SHUF_MASK(%rip), %xmm15 788 # multiply TMP5 * HashKey using karatsuba 789 790 movdqa \XMM5, \TMP4 791 pshufd $78, \XMM5, \TMP6 792 pxor \XMM5, \TMP6 793 paddd ONE(%rip), \XMM0 # INCR CNT 794 movdqa HashKey_4(%rsp), \TMP5 795 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 796 movdqa \XMM0, \XMM1 797 paddd ONE(%rip), \XMM0 # INCR CNT 798 movdqa \XMM0, \XMM2 799 paddd ONE(%rip), \XMM0 # INCR CNT 800 movdqa \XMM0, \XMM3 801 paddd ONE(%rip), \XMM0 # INCR CNT 802 movdqa \XMM0, \XMM4 803 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 804 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 805 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 806 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 807 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 808 809 pxor (%arg1), \XMM1 810 pxor (%arg1), \XMM2 811 pxor (%arg1), \XMM3 812 pxor (%arg1), \XMM4 813 movdqa HashKey_4_k(%rsp), \TMP5 814 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 815 movaps 0x10(%arg1), \TMP1 816 AESENC \TMP1, \XMM1 # Round 1 817 AESENC \TMP1, \XMM2 818 AESENC \TMP1, \XMM3 819 AESENC \TMP1, \XMM4 820 movaps 0x20(%arg1), \TMP1 821 AESENC \TMP1, \XMM1 # Round 2 822 AESENC \TMP1, \XMM2 823 AESENC \TMP1, \XMM3 824 AESENC \TMP1, \XMM4 825 movdqa \XMM6, \TMP1 826 pshufd $78, \XMM6, \TMP2 827 pxor \XMM6, \TMP2 828 movdqa HashKey_3(%rsp), \TMP5 829 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 830 movaps 0x30(%arg1), \TMP3 831 AESENC \TMP3, \XMM1 # Round 3 832 AESENC \TMP3, \XMM2 833 AESENC \TMP3, \XMM3 834 AESENC \TMP3, \XMM4 835 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 836 movaps 0x40(%arg1), \TMP3 837 AESENC \TMP3, \XMM1 # Round 4 838 AESENC \TMP3, \XMM2 839 AESENC \TMP3, \XMM3 840 AESENC \TMP3, \XMM4 841 movdqa HashKey_3_k(%rsp), \TMP5 842 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 843 movaps 0x50(%arg1), \TMP3 844 AESENC \TMP3, \XMM1 # Round 5 845 AESENC \TMP3, \XMM2 846 AESENC \TMP3, \XMM3 847 AESENC \TMP3, \XMM4 848 pxor \TMP1, \TMP4 849# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 850 pxor \XMM6, \XMM5 851 pxor \TMP2, \TMP6 852 movdqa \XMM7, \TMP1 853 pshufd $78, \XMM7, \TMP2 854 pxor \XMM7, \TMP2 855 movdqa HashKey_2(%rsp ), \TMP5 856 857 # Multiply TMP5 * HashKey using karatsuba 858 859 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 860 movaps 0x60(%arg1), \TMP3 861 AESENC \TMP3, \XMM1 # Round 6 862 AESENC \TMP3, \XMM2 863 AESENC \TMP3, \XMM3 864 AESENC \TMP3, \XMM4 865 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 866 movaps 0x70(%arg1), \TMP3 867 AESENC \TMP3, \XMM1 # Round 7 868 AESENC \TMP3, \XMM2 869 AESENC \TMP3, \XMM3 870 AESENC \TMP3, \XMM4 871 movdqa HashKey_2_k(%rsp), \TMP5 872 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 873 movaps 0x80(%arg1), \TMP3 874 AESENC \TMP3, \XMM1 # Round 8 875 AESENC \TMP3, \XMM2 876 AESENC \TMP3, \XMM3 877 AESENC \TMP3, \XMM4 878 pxor \TMP1, \TMP4 879# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 880 pxor \XMM7, \XMM5 881 pxor \TMP2, \TMP6 882 883 # Multiply XMM8 * HashKey 884 # XMM8 and TMP5 hold the values for the two operands 885 886 movdqa \XMM8, \TMP1 887 pshufd $78, \XMM8, \TMP2 888 pxor \XMM8, \TMP2 889 movdqa HashKey(%rsp), \TMP5 890 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 891 movaps 0x90(%arg1), \TMP3 892 AESENC \TMP3, \XMM1 # Round 9 893 AESENC \TMP3, \XMM2 894 AESENC \TMP3, \XMM3 895 AESENC \TMP3, \XMM4 896 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 897 lea 0xa0(%arg1),%r10 898 mov keysize,%eax 899 shr $2,%eax # 128->4, 192->6, 256->8 900 sub $4,%eax # 128->0, 192->2, 256->4 901 jz aes_loop_par_enc_done 902 903aes_loop_par_enc: 904 MOVADQ (%r10),\TMP3 905.irpc index, 1234 906 AESENC \TMP3, %xmm\index 907.endr 908 add $16,%r10 909 sub $1,%eax 910 jnz aes_loop_par_enc 911 912aes_loop_par_enc_done: 913 MOVADQ (%r10), \TMP3 914 AESENCLAST \TMP3, \XMM1 # Round 10 915 AESENCLAST \TMP3, \XMM2 916 AESENCLAST \TMP3, \XMM3 917 AESENCLAST \TMP3, \XMM4 918 movdqa HashKey_k(%rsp), \TMP5 919 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 920 movdqu (%arg3,%r11,1), \TMP3 921 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 922 movdqu 16(%arg3,%r11,1), \TMP3 923 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 924 movdqu 32(%arg3,%r11,1), \TMP3 925 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 926 movdqu 48(%arg3,%r11,1), \TMP3 927 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 928 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer 929 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer 930 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer 931 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer 932 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 933 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 934 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 935 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 936 937 pxor \TMP4, \TMP1 938 pxor \XMM8, \XMM5 939 pxor \TMP6, \TMP2 940 pxor \TMP1, \TMP2 941 pxor \XMM5, \TMP2 942 movdqa \TMP2, \TMP3 943 pslldq $8, \TMP3 # left shift TMP3 2 DWs 944 psrldq $8, \TMP2 # right shift TMP2 2 DWs 945 pxor \TMP3, \XMM5 946 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 947 948 # first phase of reduction 949 950 movdqa \XMM5, \TMP2 951 movdqa \XMM5, \TMP3 952 movdqa \XMM5, \TMP4 953# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 954 pslld $31, \TMP2 # packed right shift << 31 955 pslld $30, \TMP3 # packed right shift << 30 956 pslld $25, \TMP4 # packed right shift << 25 957 pxor \TMP3, \TMP2 # xor the shifted versions 958 pxor \TMP4, \TMP2 959 movdqa \TMP2, \TMP5 960 psrldq $4, \TMP5 # right shift T5 1 DW 961 pslldq $12, \TMP2 # left shift T2 3 DWs 962 pxor \TMP2, \XMM5 963 964 # second phase of reduction 965 966 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 967 movdqa \XMM5,\TMP3 968 movdqa \XMM5,\TMP4 969 psrld $1, \TMP2 # packed left shift >>1 970 psrld $2, \TMP3 # packed left shift >>2 971 psrld $7, \TMP4 # packed left shift >>7 972 pxor \TMP3,\TMP2 # xor the shifted versions 973 pxor \TMP4,\TMP2 974 pxor \TMP5, \TMP2 975 pxor \TMP2, \XMM5 976 pxor \TMP1, \XMM5 # result is in TMP1 977 978 pxor \XMM5, \XMM1 979.endm 980 981/* 982* decrypt 4 blocks at a time 983* ghash the 4 previously decrypted ciphertext blocks 984* arg1, %arg2, %arg3 are used as pointers only, not modified 985* %r11 is the data offset value 986*/ 987.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ 988TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 989 990 movdqa \XMM1, \XMM5 991 movdqa \XMM2, \XMM6 992 movdqa \XMM3, \XMM7 993 movdqa \XMM4, \XMM8 994 995 movdqa SHUF_MASK(%rip), %xmm15 996 # multiply TMP5 * HashKey using karatsuba 997 998 movdqa \XMM5, \TMP4 999 pshufd $78, \XMM5, \TMP6 1000 pxor \XMM5, \TMP6 1001 paddd ONE(%rip), \XMM0 # INCR CNT 1002 movdqa HashKey_4(%rsp), \TMP5 1003 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 1004 movdqa \XMM0, \XMM1 1005 paddd ONE(%rip), \XMM0 # INCR CNT 1006 movdqa \XMM0, \XMM2 1007 paddd ONE(%rip), \XMM0 # INCR CNT 1008 movdqa \XMM0, \XMM3 1009 paddd ONE(%rip), \XMM0 # INCR CNT 1010 movdqa \XMM0, \XMM4 1011 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 1012 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 1013 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 1014 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 1015 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 1016 1017 pxor (%arg1), \XMM1 1018 pxor (%arg1), \XMM2 1019 pxor (%arg1), \XMM3 1020 pxor (%arg1), \XMM4 1021 movdqa HashKey_4_k(%rsp), \TMP5 1022 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 1023 movaps 0x10(%arg1), \TMP1 1024 AESENC \TMP1, \XMM1 # Round 1 1025 AESENC \TMP1, \XMM2 1026 AESENC \TMP1, \XMM3 1027 AESENC \TMP1, \XMM4 1028 movaps 0x20(%arg1), \TMP1 1029 AESENC \TMP1, \XMM1 # Round 2 1030 AESENC \TMP1, \XMM2 1031 AESENC \TMP1, \XMM3 1032 AESENC \TMP1, \XMM4 1033 movdqa \XMM6, \TMP1 1034 pshufd $78, \XMM6, \TMP2 1035 pxor \XMM6, \TMP2 1036 movdqa HashKey_3(%rsp), \TMP5 1037 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 1038 movaps 0x30(%arg1), \TMP3 1039 AESENC \TMP3, \XMM1 # Round 3 1040 AESENC \TMP3, \XMM2 1041 AESENC \TMP3, \XMM3 1042 AESENC \TMP3, \XMM4 1043 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 1044 movaps 0x40(%arg1), \TMP3 1045 AESENC \TMP3, \XMM1 # Round 4 1046 AESENC \TMP3, \XMM2 1047 AESENC \TMP3, \XMM3 1048 AESENC \TMP3, \XMM4 1049 movdqa HashKey_3_k(%rsp), \TMP5 1050 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1051 movaps 0x50(%arg1), \TMP3 1052 AESENC \TMP3, \XMM1 # Round 5 1053 AESENC \TMP3, \XMM2 1054 AESENC \TMP3, \XMM3 1055 AESENC \TMP3, \XMM4 1056 pxor \TMP1, \TMP4 1057# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1058 pxor \XMM6, \XMM5 1059 pxor \TMP2, \TMP6 1060 movdqa \XMM7, \TMP1 1061 pshufd $78, \XMM7, \TMP2 1062 pxor \XMM7, \TMP2 1063 movdqa HashKey_2(%rsp ), \TMP5 1064 1065 # Multiply TMP5 * HashKey using karatsuba 1066 1067 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1068 movaps 0x60(%arg1), \TMP3 1069 AESENC \TMP3, \XMM1 # Round 6 1070 AESENC \TMP3, \XMM2 1071 AESENC \TMP3, \XMM3 1072 AESENC \TMP3, \XMM4 1073 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 1074 movaps 0x70(%arg1), \TMP3 1075 AESENC \TMP3, \XMM1 # Round 7 1076 AESENC \TMP3, \XMM2 1077 AESENC \TMP3, \XMM3 1078 AESENC \TMP3, \XMM4 1079 movdqa HashKey_2_k(%rsp), \TMP5 1080 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1081 movaps 0x80(%arg1), \TMP3 1082 AESENC \TMP3, \XMM1 # Round 8 1083 AESENC \TMP3, \XMM2 1084 AESENC \TMP3, \XMM3 1085 AESENC \TMP3, \XMM4 1086 pxor \TMP1, \TMP4 1087# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1088 pxor \XMM7, \XMM5 1089 pxor \TMP2, \TMP6 1090 1091 # Multiply XMM8 * HashKey 1092 # XMM8 and TMP5 hold the values for the two operands 1093 1094 movdqa \XMM8, \TMP1 1095 pshufd $78, \XMM8, \TMP2 1096 pxor \XMM8, \TMP2 1097 movdqa HashKey(%rsp), \TMP5 1098 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1099 movaps 0x90(%arg1), \TMP3 1100 AESENC \TMP3, \XMM1 # Round 9 1101 AESENC \TMP3, \XMM2 1102 AESENC \TMP3, \XMM3 1103 AESENC \TMP3, \XMM4 1104 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 1105 lea 0xa0(%arg1),%r10 1106 mov keysize,%eax 1107 shr $2,%eax # 128->4, 192->6, 256->8 1108 sub $4,%eax # 128->0, 192->2, 256->4 1109 jz aes_loop_par_dec_done 1110 1111aes_loop_par_dec: 1112 MOVADQ (%r10),\TMP3 1113.irpc index, 1234 1114 AESENC \TMP3, %xmm\index 1115.endr 1116 add $16,%r10 1117 sub $1,%eax 1118 jnz aes_loop_par_dec 1119 1120aes_loop_par_dec_done: 1121 MOVADQ (%r10), \TMP3 1122 AESENCLAST \TMP3, \XMM1 # last round 1123 AESENCLAST \TMP3, \XMM2 1124 AESENCLAST \TMP3, \XMM3 1125 AESENCLAST \TMP3, \XMM4 1126 movdqa HashKey_k(%rsp), \TMP5 1127 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1128 movdqu (%arg3,%r11,1), \TMP3 1129 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1130 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer 1131 movdqa \TMP3, \XMM1 1132 movdqu 16(%arg3,%r11,1), \TMP3 1133 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1134 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer 1135 movdqa \TMP3, \XMM2 1136 movdqu 32(%arg3,%r11,1), \TMP3 1137 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1138 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer 1139 movdqa \TMP3, \XMM3 1140 movdqu 48(%arg3,%r11,1), \TMP3 1141 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1142 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer 1143 movdqa \TMP3, \XMM4 1144 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 1145 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 1146 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 1147 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 1148 1149 pxor \TMP4, \TMP1 1150 pxor \XMM8, \XMM5 1151 pxor \TMP6, \TMP2 1152 pxor \TMP1, \TMP2 1153 pxor \XMM5, \TMP2 1154 movdqa \TMP2, \TMP3 1155 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1156 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1157 pxor \TMP3, \XMM5 1158 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1159 1160 # first phase of reduction 1161 1162 movdqa \XMM5, \TMP2 1163 movdqa \XMM5, \TMP3 1164 movdqa \XMM5, \TMP4 1165# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1166 pslld $31, \TMP2 # packed right shift << 31 1167 pslld $30, \TMP3 # packed right shift << 30 1168 pslld $25, \TMP4 # packed right shift << 25 1169 pxor \TMP3, \TMP2 # xor the shifted versions 1170 pxor \TMP4, \TMP2 1171 movdqa \TMP2, \TMP5 1172 psrldq $4, \TMP5 # right shift T5 1 DW 1173 pslldq $12, \TMP2 # left shift T2 3 DWs 1174 pxor \TMP2, \XMM5 1175 1176 # second phase of reduction 1177 1178 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1179 movdqa \XMM5,\TMP3 1180 movdqa \XMM5,\TMP4 1181 psrld $1, \TMP2 # packed left shift >>1 1182 psrld $2, \TMP3 # packed left shift >>2 1183 psrld $7, \TMP4 # packed left shift >>7 1184 pxor \TMP3,\TMP2 # xor the shifted versions 1185 pxor \TMP4,\TMP2 1186 pxor \TMP5, \TMP2 1187 pxor \TMP2, \XMM5 1188 pxor \TMP1, \XMM5 # result is in TMP1 1189 1190 pxor \XMM5, \XMM1 1191.endm 1192 1193/* GHASH the last 4 ciphertext blocks. */ 1194.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ 1195TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst 1196 1197 # Multiply TMP6 * HashKey (using Karatsuba) 1198 1199 movdqa \XMM1, \TMP6 1200 pshufd $78, \XMM1, \TMP2 1201 pxor \XMM1, \TMP2 1202 movdqa HashKey_4(%rsp), \TMP5 1203 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 1204 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 1205 movdqa HashKey_4_k(%rsp), \TMP4 1206 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1207 movdqa \XMM1, \XMMDst 1208 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 1209 1210 # Multiply TMP1 * HashKey (using Karatsuba) 1211 1212 movdqa \XMM2, \TMP1 1213 pshufd $78, \XMM2, \TMP2 1214 pxor \XMM2, \TMP2 1215 movdqa HashKey_3(%rsp), \TMP5 1216 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1217 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 1218 movdqa HashKey_3_k(%rsp), \TMP4 1219 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1220 pxor \TMP1, \TMP6 1221 pxor \XMM2, \XMMDst 1222 pxor \TMP2, \XMM1 1223# results accumulated in TMP6, XMMDst, XMM1 1224 1225 # Multiply TMP1 * HashKey (using Karatsuba) 1226 1227 movdqa \XMM3, \TMP1 1228 pshufd $78, \XMM3, \TMP2 1229 pxor \XMM3, \TMP2 1230 movdqa HashKey_2(%rsp), \TMP5 1231 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1232 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 1233 movdqa HashKey_2_k(%rsp), \TMP4 1234 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1235 pxor \TMP1, \TMP6 1236 pxor \XMM3, \XMMDst 1237 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 1238 1239 # Multiply TMP1 * HashKey (using Karatsuba) 1240 movdqa \XMM4, \TMP1 1241 pshufd $78, \XMM4, \TMP2 1242 pxor \XMM4, \TMP2 1243 movdqa HashKey(%rsp), \TMP5 1244 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1245 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 1246 movdqa HashKey_k(%rsp), \TMP4 1247 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1248 pxor \TMP1, \TMP6 1249 pxor \XMM4, \XMMDst 1250 pxor \XMM1, \TMP2 1251 pxor \TMP6, \TMP2 1252 pxor \XMMDst, \TMP2 1253 # middle section of the temp results combined as in karatsuba algorithm 1254 movdqa \TMP2, \TMP4 1255 pslldq $8, \TMP4 # left shift TMP4 2 DWs 1256 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1257 pxor \TMP4, \XMMDst 1258 pxor \TMP2, \TMP6 1259# TMP6:XMMDst holds the result of the accumulated carry-less multiplications 1260 # first phase of the reduction 1261 movdqa \XMMDst, \TMP2 1262 movdqa \XMMDst, \TMP3 1263 movdqa \XMMDst, \TMP4 1264# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently 1265 pslld $31, \TMP2 # packed right shifting << 31 1266 pslld $30, \TMP3 # packed right shifting << 30 1267 pslld $25, \TMP4 # packed right shifting << 25 1268 pxor \TMP3, \TMP2 # xor the shifted versions 1269 pxor \TMP4, \TMP2 1270 movdqa \TMP2, \TMP7 1271 psrldq $4, \TMP7 # right shift TMP7 1 DW 1272 pslldq $12, \TMP2 # left shift TMP2 3 DWs 1273 pxor \TMP2, \XMMDst 1274 1275 # second phase of the reduction 1276 movdqa \XMMDst, \TMP2 1277 # make 3 copies of XMMDst for doing 3 shift operations 1278 movdqa \XMMDst, \TMP3 1279 movdqa \XMMDst, \TMP4 1280 psrld $1, \TMP2 # packed left shift >> 1 1281 psrld $2, \TMP3 # packed left shift >> 2 1282 psrld $7, \TMP4 # packed left shift >> 7 1283 pxor \TMP3, \TMP2 # xor the shifted versions 1284 pxor \TMP4, \TMP2 1285 pxor \TMP7, \TMP2 1286 pxor \TMP2, \XMMDst 1287 pxor \TMP6, \XMMDst # reduced result is in XMMDst 1288.endm 1289 1290 1291/* Encryption of a single block 1292* uses eax & r10 1293*/ 1294 1295.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 1296 1297 pxor (%arg1), \XMM0 1298 mov keysize,%eax 1299 shr $2,%eax # 128->4, 192->6, 256->8 1300 add $5,%eax # 128->9, 192->11, 256->13 1301 lea 16(%arg1), %r10 # get first expanded key address 1302 1303_esb_loop_\@: 1304 MOVADQ (%r10),\TMP1 1305 AESENC \TMP1,\XMM0 1306 add $16,%r10 1307 sub $1,%eax 1308 jnz _esb_loop_\@ 1309 1310 MOVADQ (%r10),\TMP1 1311 AESENCLAST \TMP1,\XMM0 1312.endm 1313/***************************************************************************** 1314* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1315* u8 *out, // Plaintext output. Encrypt in-place is allowed. 1316* const u8 *in, // Ciphertext input 1317* u64 plaintext_len, // Length of data in bytes for decryption. 1318* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1319* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1320* // concatenated with 0x00000001. 16-byte aligned pointer. 1321* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1322* const u8 *aad, // Additional Authentication Data (AAD) 1323* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1324* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the 1325* // given authentication tag and only return the plaintext if they match. 1326* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 1327* // (most likely), 12 or 8. 1328* 1329* Assumptions: 1330* 1331* keys: 1332* keys are pre-expanded and aligned to 16 bytes. we are using the first 1333* set of 11 keys in the data structure void *aes_ctx 1334* 1335* iv: 1336* 0 1 2 3 1337* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1338* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1339* | Salt (From the SA) | 1340* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1341* | Initialization Vector | 1342* | (This is the sequence number from IPSec header) | 1343* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1344* | 0x1 | 1345* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1346* 1347* 1348* 1349* AAD: 1350* AAD padded to 128 bits with 0 1351* for example, assume AAD is a u32 vector 1352* 1353* if AAD is 8 bytes: 1354* AAD[3] = {A0, A1}; 1355* padded AAD in xmm register = {A1 A0 0 0} 1356* 1357* 0 1 2 3 1358* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1359* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1360* | SPI (A1) | 1361* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1362* | 32-bit Sequence Number (A0) | 1363* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1364* | 0x0 | 1365* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1366* 1367* AAD Format with 32-bit Sequence Number 1368* 1369* if AAD is 12 bytes: 1370* AAD[3] = {A0, A1, A2}; 1371* padded AAD in xmm register = {A2 A1 A0 0} 1372* 1373* 0 1 2 3 1374* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1375* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1376* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1377* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1378* | SPI (A2) | 1379* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1380* | 64-bit Extended Sequence Number {A1,A0} | 1381* | | 1382* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1383* | 0x0 | 1384* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1385* 1386* AAD Format with 64-bit Extended Sequence Number 1387* 1388* aadLen: 1389* from the definition of the spec, aadLen can only be 8 or 12 bytes. 1390* The code supports 16 too but for other sizes, the code will fail. 1391* 1392* TLen: 1393* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 1394* For other sizes, the code will fail. 1395* 1396* poly = x^128 + x^127 + x^126 + x^121 + 1 1397* 1398*****************************************************************************/ 1399ENTRY(aesni_gcm_dec) 1400 push %r12 1401 push %r13 1402 push %r14 1403 mov %rsp, %r14 1404/* 1405* states of %xmm registers %xmm6:%xmm15 not saved 1406* all %xmm registers are clobbered 1407*/ 1408 sub $VARIABLE_OFFSET, %rsp 1409 and $~63, %rsp # align rsp to 64 bytes 1410 mov %arg6, %r12 1411 movdqu (%r12), %xmm13 # %xmm13 = HashKey 1412 movdqa SHUF_MASK(%rip), %xmm2 1413 PSHUFB_XMM %xmm2, %xmm13 1414 1415 1416# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) 1417 1418 movdqa %xmm13, %xmm2 1419 psllq $1, %xmm13 1420 psrlq $63, %xmm2 1421 movdqa %xmm2, %xmm1 1422 pslldq $8, %xmm2 1423 psrldq $8, %xmm1 1424 por %xmm2, %xmm13 1425 1426 # Reduction 1427 1428 pshufd $0x24, %xmm1, %xmm2 1429 pcmpeqd TWOONE(%rip), %xmm2 1430 pand POLY(%rip), %xmm2 1431 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly) 1432 1433 1434 # Decrypt first few blocks 1435 1436 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) 1437 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext 1438 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) 1439 mov %r13, %r12 1440 and $(3<<4), %r12 1441 jz _initial_num_blocks_is_0_decrypt 1442 cmp $(2<<4), %r12 1443 jb _initial_num_blocks_is_1_decrypt 1444 je _initial_num_blocks_is_2_decrypt 1445_initial_num_blocks_is_3_decrypt: 1446 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1447%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec 1448 sub $48, %r13 1449 jmp _initial_blocks_decrypted 1450_initial_num_blocks_is_2_decrypt: 1451 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1452%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec 1453 sub $32, %r13 1454 jmp _initial_blocks_decrypted 1455_initial_num_blocks_is_1_decrypt: 1456 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1457%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec 1458 sub $16, %r13 1459 jmp _initial_blocks_decrypted 1460_initial_num_blocks_is_0_decrypt: 1461 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1462%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec 1463_initial_blocks_decrypted: 1464 cmp $0, %r13 1465 je _zero_cipher_left_decrypt 1466 sub $64, %r13 1467 je _four_cipher_left_decrypt 1468_decrypt_by_4: 1469 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1470%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec 1471 add $64, %r11 1472 sub $64, %r13 1473 jne _decrypt_by_4 1474_four_cipher_left_decrypt: 1475 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 1476%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 1477_zero_cipher_left_decrypt: 1478 mov %arg4, %r13 1479 and $15, %r13 # %r13 = arg4 (mod 16) 1480 je _multiple_of_16_bytes_decrypt 1481 1482 # Handle the last <16 byte block separately 1483 1484 paddd ONE(%rip), %xmm0 # increment CNT to get Yn 1485 movdqa SHUF_MASK(%rip), %xmm10 1486 PSHUFB_XMM %xmm10, %xmm0 1487 1488 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) 1489 sub $16, %r11 1490 add %r13, %r11 1491 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block 1492 lea SHIFT_MASK+16(%rip), %r12 1493 sub %r13, %r12 1494# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes 1495# (%r13 is the number of bytes in plaintext mod 16) 1496 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 1497 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 butes 1498 1499 movdqa %xmm1, %xmm2 1500 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) 1501 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 1502 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 1503 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 1504 pand %xmm1, %xmm2 1505 movdqa SHUF_MASK(%rip), %xmm10 1506 PSHUFB_XMM %xmm10 ,%xmm2 1507 1508 pxor %xmm2, %xmm8 1509 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1510 # GHASH computation for the last <16 byte block 1511 sub %r13, %r11 1512 add $16, %r11 1513 1514 # output %r13 bytes 1515 MOVQ_R64_XMM %xmm0, %rax 1516 cmp $8, %r13 1517 jle _less_than_8_bytes_left_decrypt 1518 mov %rax, (%arg2 , %r11, 1) 1519 add $8, %r11 1520 psrldq $8, %xmm0 1521 MOVQ_R64_XMM %xmm0, %rax 1522 sub $8, %r13 1523_less_than_8_bytes_left_decrypt: 1524 mov %al, (%arg2, %r11, 1) 1525 add $1, %r11 1526 shr $8, %rax 1527 sub $1, %r13 1528 jne _less_than_8_bytes_left_decrypt 1529_multiple_of_16_bytes_decrypt: 1530 mov arg8, %r12 # %r13 = aadLen (number of bytes) 1531 shl $3, %r12 # convert into number of bits 1532 movd %r12d, %xmm15 # len(A) in %xmm15 1533 shl $3, %arg4 # len(C) in bits (*128) 1534 MOVQ_R64_XMM %arg4, %xmm1 1535 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1536 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1537 pxor %xmm15, %xmm8 1538 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1539 # final GHASH computation 1540 movdqa SHUF_MASK(%rip), %xmm10 1541 PSHUFB_XMM %xmm10, %xmm8 1542 1543 mov %arg5, %rax # %rax = *Y0 1544 movdqu (%rax), %xmm0 # %xmm0 = Y0 1545 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) 1546 pxor %xmm8, %xmm0 1547_return_T_decrypt: 1548 mov arg9, %r10 # %r10 = authTag 1549 mov arg10, %r11 # %r11 = auth_tag_len 1550 cmp $16, %r11 1551 je _T_16_decrypt 1552 cmp $8, %r11 1553 jl _T_4_decrypt 1554_T_8_decrypt: 1555 MOVQ_R64_XMM %xmm0, %rax 1556 mov %rax, (%r10) 1557 add $8, %r10 1558 sub $8, %r11 1559 psrldq $8, %xmm0 1560 cmp $0, %r11 1561 je _return_T_done_decrypt 1562_T_4_decrypt: 1563 movd %xmm0, %eax 1564 mov %eax, (%r10) 1565 add $4, %r10 1566 sub $4, %r11 1567 psrldq $4, %xmm0 1568 cmp $0, %r11 1569 je _return_T_done_decrypt 1570_T_123_decrypt: 1571 movd %xmm0, %eax 1572 cmp $2, %r11 1573 jl _T_1_decrypt 1574 mov %ax, (%r10) 1575 cmp $2, %r11 1576 je _return_T_done_decrypt 1577 add $2, %r10 1578 sar $16, %eax 1579_T_1_decrypt: 1580 mov %al, (%r10) 1581 jmp _return_T_done_decrypt 1582_T_16_decrypt: 1583 movdqu %xmm0, (%r10) 1584_return_T_done_decrypt: 1585 mov %r14, %rsp 1586 pop %r14 1587 pop %r13 1588 pop %r12 1589 ret 1590ENDPROC(aesni_gcm_dec) 1591 1592 1593/***************************************************************************** 1594* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1595* u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1596* const u8 *in, // Plaintext input 1597* u64 plaintext_len, // Length of data in bytes for encryption. 1598* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1599* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1600* // concatenated with 0x00000001. 16-byte aligned pointer. 1601* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1602* const u8 *aad, // Additional Authentication Data (AAD) 1603* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1604* u8 *auth_tag, // Authenticated Tag output. 1605* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 1606* // 12 or 8. 1607* 1608* Assumptions: 1609* 1610* keys: 1611* keys are pre-expanded and aligned to 16 bytes. we are using the 1612* first set of 11 keys in the data structure void *aes_ctx 1613* 1614* 1615* iv: 1616* 0 1 2 3 1617* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1618* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1619* | Salt (From the SA) | 1620* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1621* | Initialization Vector | 1622* | (This is the sequence number from IPSec header) | 1623* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1624* | 0x1 | 1625* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1626* 1627* 1628* 1629* AAD: 1630* AAD padded to 128 bits with 0 1631* for example, assume AAD is a u32 vector 1632* 1633* if AAD is 8 bytes: 1634* AAD[3] = {A0, A1}; 1635* padded AAD in xmm register = {A1 A0 0 0} 1636* 1637* 0 1 2 3 1638* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1639* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1640* | SPI (A1) | 1641* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1642* | 32-bit Sequence Number (A0) | 1643* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1644* | 0x0 | 1645* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1646* 1647* AAD Format with 32-bit Sequence Number 1648* 1649* if AAD is 12 bytes: 1650* AAD[3] = {A0, A1, A2}; 1651* padded AAD in xmm register = {A2 A1 A0 0} 1652* 1653* 0 1 2 3 1654* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1655* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1656* | SPI (A2) | 1657* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1658* | 64-bit Extended Sequence Number {A1,A0} | 1659* | | 1660* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1661* | 0x0 | 1662* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1663* 1664* AAD Format with 64-bit Extended Sequence Number 1665* 1666* aadLen: 1667* from the definition of the spec, aadLen can only be 8 or 12 bytes. 1668* The code supports 16 too but for other sizes, the code will fail. 1669* 1670* TLen: 1671* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 1672* For other sizes, the code will fail. 1673* 1674* poly = x^128 + x^127 + x^126 + x^121 + 1 1675***************************************************************************/ 1676ENTRY(aesni_gcm_enc) 1677 push %r12 1678 push %r13 1679 push %r14 1680 mov %rsp, %r14 1681# 1682# states of %xmm registers %xmm6:%xmm15 not saved 1683# all %xmm registers are clobbered 1684# 1685 sub $VARIABLE_OFFSET, %rsp 1686 and $~63, %rsp 1687 mov %arg6, %r12 1688 movdqu (%r12), %xmm13 1689 movdqa SHUF_MASK(%rip), %xmm2 1690 PSHUFB_XMM %xmm2, %xmm13 1691 1692 1693# precompute HashKey<<1 mod poly from the HashKey (required for GHASH) 1694 1695 movdqa %xmm13, %xmm2 1696 psllq $1, %xmm13 1697 psrlq $63, %xmm2 1698 movdqa %xmm2, %xmm1 1699 pslldq $8, %xmm2 1700 psrldq $8, %xmm1 1701 por %xmm2, %xmm13 1702 1703 # reduce HashKey<<1 1704 1705 pshufd $0x24, %xmm1, %xmm2 1706 pcmpeqd TWOONE(%rip), %xmm2 1707 pand POLY(%rip), %xmm2 1708 pxor %xmm2, %xmm13 1709 movdqa %xmm13, HashKey(%rsp) 1710 mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly) 1711 and $-16, %r13 1712 mov %r13, %r12 1713 1714 # Encrypt first few blocks 1715 1716 and $(3<<4), %r12 1717 jz _initial_num_blocks_is_0_encrypt 1718 cmp $(2<<4), %r12 1719 jb _initial_num_blocks_is_1_encrypt 1720 je _initial_num_blocks_is_2_encrypt 1721_initial_num_blocks_is_3_encrypt: 1722 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1723%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc 1724 sub $48, %r13 1725 jmp _initial_blocks_encrypted 1726_initial_num_blocks_is_2_encrypt: 1727 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1728%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc 1729 sub $32, %r13 1730 jmp _initial_blocks_encrypted 1731_initial_num_blocks_is_1_encrypt: 1732 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1733%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc 1734 sub $16, %r13 1735 jmp _initial_blocks_encrypted 1736_initial_num_blocks_is_0_encrypt: 1737 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1738%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc 1739_initial_blocks_encrypted: 1740 1741 # Main loop - Encrypt remaining blocks 1742 1743 cmp $0, %r13 1744 je _zero_cipher_left_encrypt 1745 sub $64, %r13 1746 je _four_cipher_left_encrypt 1747_encrypt_by_4_encrypt: 1748 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1749%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc 1750 add $64, %r11 1751 sub $64, %r13 1752 jne _encrypt_by_4_encrypt 1753_four_cipher_left_encrypt: 1754 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 1755%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 1756_zero_cipher_left_encrypt: 1757 mov %arg4, %r13 1758 and $15, %r13 # %r13 = arg4 (mod 16) 1759 je _multiple_of_16_bytes_encrypt 1760 1761 # Handle the last <16 Byte block separately 1762 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 1763 movdqa SHUF_MASK(%rip), %xmm10 1764 PSHUFB_XMM %xmm10, %xmm0 1765 1766 1767 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) 1768 sub $16, %r11 1769 add %r13, %r11 1770 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks 1771 lea SHIFT_MASK+16(%rip), %r12 1772 sub %r13, %r12 1773 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 1774 # (%r13 is the number of bytes in plaintext mod 16) 1775 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 1776 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte 1777 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) 1778 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 1779 # get the appropriate mask to mask out top 16-r13 bytes of xmm0 1780 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 1781 movdqa SHUF_MASK(%rip), %xmm10 1782 PSHUFB_XMM %xmm10,%xmm0 1783 1784 pxor %xmm0, %xmm8 1785 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1786 # GHASH computation for the last <16 byte block 1787 sub %r13, %r11 1788 add $16, %r11 1789 1790 movdqa SHUF_MASK(%rip), %xmm10 1791 PSHUFB_XMM %xmm10, %xmm0 1792 1793 # shuffle xmm0 back to output as ciphertext 1794 1795 # Output %r13 bytes 1796 MOVQ_R64_XMM %xmm0, %rax 1797 cmp $8, %r13 1798 jle _less_than_8_bytes_left_encrypt 1799 mov %rax, (%arg2 , %r11, 1) 1800 add $8, %r11 1801 psrldq $8, %xmm0 1802 MOVQ_R64_XMM %xmm0, %rax 1803 sub $8, %r13 1804_less_than_8_bytes_left_encrypt: 1805 mov %al, (%arg2, %r11, 1) 1806 add $1, %r11 1807 shr $8, %rax 1808 sub $1, %r13 1809 jne _less_than_8_bytes_left_encrypt 1810_multiple_of_16_bytes_encrypt: 1811 mov arg8, %r12 # %r12 = addLen (number of bytes) 1812 shl $3, %r12 1813 movd %r12d, %xmm15 # len(A) in %xmm15 1814 shl $3, %arg4 # len(C) in bits (*128) 1815 MOVQ_R64_XMM %arg4, %xmm1 1816 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1817 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1818 pxor %xmm15, %xmm8 1819 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1820 # final GHASH computation 1821 movdqa SHUF_MASK(%rip), %xmm10 1822 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap 1823 1824 mov %arg5, %rax # %rax = *Y0 1825 movdqu (%rax), %xmm0 # %xmm0 = Y0 1826 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) 1827 pxor %xmm8, %xmm0 1828_return_T_encrypt: 1829 mov arg9, %r10 # %r10 = authTag 1830 mov arg10, %r11 # %r11 = auth_tag_len 1831 cmp $16, %r11 1832 je _T_16_encrypt 1833 cmp $8, %r11 1834 jl _T_4_encrypt 1835_T_8_encrypt: 1836 MOVQ_R64_XMM %xmm0, %rax 1837 mov %rax, (%r10) 1838 add $8, %r10 1839 sub $8, %r11 1840 psrldq $8, %xmm0 1841 cmp $0, %r11 1842 je _return_T_done_encrypt 1843_T_4_encrypt: 1844 movd %xmm0, %eax 1845 mov %eax, (%r10) 1846 add $4, %r10 1847 sub $4, %r11 1848 psrldq $4, %xmm0 1849 cmp $0, %r11 1850 je _return_T_done_encrypt 1851_T_123_encrypt: 1852 movd %xmm0, %eax 1853 cmp $2, %r11 1854 jl _T_1_encrypt 1855 mov %ax, (%r10) 1856 cmp $2, %r11 1857 je _return_T_done_encrypt 1858 add $2, %r10 1859 sar $16, %eax 1860_T_1_encrypt: 1861 mov %al, (%r10) 1862 jmp _return_T_done_encrypt 1863_T_16_encrypt: 1864 movdqu %xmm0, (%r10) 1865_return_T_done_encrypt: 1866 mov %r14, %rsp 1867 pop %r14 1868 pop %r13 1869 pop %r12 1870 ret 1871ENDPROC(aesni_gcm_enc) 1872 1873#endif 1874 1875 1876.align 4 1877_key_expansion_128: 1878_key_expansion_256a: 1879 pshufd $0b11111111, %xmm1, %xmm1 1880 shufps $0b00010000, %xmm0, %xmm4 1881 pxor %xmm4, %xmm0 1882 shufps $0b10001100, %xmm0, %xmm4 1883 pxor %xmm4, %xmm0 1884 pxor %xmm1, %xmm0 1885 movaps %xmm0, (TKEYP) 1886 add $0x10, TKEYP 1887 ret 1888ENDPROC(_key_expansion_128) 1889ENDPROC(_key_expansion_256a) 1890 1891.align 4 1892_key_expansion_192a: 1893 pshufd $0b01010101, %xmm1, %xmm1 1894 shufps $0b00010000, %xmm0, %xmm4 1895 pxor %xmm4, %xmm0 1896 shufps $0b10001100, %xmm0, %xmm4 1897 pxor %xmm4, %xmm0 1898 pxor %xmm1, %xmm0 1899 1900 movaps %xmm2, %xmm5 1901 movaps %xmm2, %xmm6 1902 pslldq $4, %xmm5 1903 pshufd $0b11111111, %xmm0, %xmm3 1904 pxor %xmm3, %xmm2 1905 pxor %xmm5, %xmm2 1906 1907 movaps %xmm0, %xmm1 1908 shufps $0b01000100, %xmm0, %xmm6 1909 movaps %xmm6, (TKEYP) 1910 shufps $0b01001110, %xmm2, %xmm1 1911 movaps %xmm1, 0x10(TKEYP) 1912 add $0x20, TKEYP 1913 ret 1914ENDPROC(_key_expansion_192a) 1915 1916.align 4 1917_key_expansion_192b: 1918 pshufd $0b01010101, %xmm1, %xmm1 1919 shufps $0b00010000, %xmm0, %xmm4 1920 pxor %xmm4, %xmm0 1921 shufps $0b10001100, %xmm0, %xmm4 1922 pxor %xmm4, %xmm0 1923 pxor %xmm1, %xmm0 1924 1925 movaps %xmm2, %xmm5 1926 pslldq $4, %xmm5 1927 pshufd $0b11111111, %xmm0, %xmm3 1928 pxor %xmm3, %xmm2 1929 pxor %xmm5, %xmm2 1930 1931 movaps %xmm0, (TKEYP) 1932 add $0x10, TKEYP 1933 ret 1934ENDPROC(_key_expansion_192b) 1935 1936.align 4 1937_key_expansion_256b: 1938 pshufd $0b10101010, %xmm1, %xmm1 1939 shufps $0b00010000, %xmm2, %xmm4 1940 pxor %xmm4, %xmm2 1941 shufps $0b10001100, %xmm2, %xmm4 1942 pxor %xmm4, %xmm2 1943 pxor %xmm1, %xmm2 1944 movaps %xmm2, (TKEYP) 1945 add $0x10, TKEYP 1946 ret 1947ENDPROC(_key_expansion_256b) 1948 1949/* 1950 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 1951 * unsigned int key_len) 1952 */ 1953ENTRY(aesni_set_key) 1954 FRAME_BEGIN 1955#ifndef __x86_64__ 1956 pushl KEYP 1957 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx 1958 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key 1959 movl (FRAME_OFFSET+16)(%esp), %edx # key_len 1960#endif 1961 movups (UKEYP), %xmm0 # user key (first 16 bytes) 1962 movaps %xmm0, (KEYP) 1963 lea 0x10(KEYP), TKEYP # key addr 1964 movl %edx, 480(KEYP) 1965 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1966 cmp $24, %dl 1967 jb .Lenc_key128 1968 je .Lenc_key192 1969 movups 0x10(UKEYP), %xmm2 # other user key 1970 movaps %xmm2, (TKEYP) 1971 add $0x10, TKEYP 1972 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1973 call _key_expansion_256a 1974 AESKEYGENASSIST 0x1 %xmm0 %xmm1 1975 call _key_expansion_256b 1976 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1977 call _key_expansion_256a 1978 AESKEYGENASSIST 0x2 %xmm0 %xmm1 1979 call _key_expansion_256b 1980 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 1981 call _key_expansion_256a 1982 AESKEYGENASSIST 0x4 %xmm0 %xmm1 1983 call _key_expansion_256b 1984 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 1985 call _key_expansion_256a 1986 AESKEYGENASSIST 0x8 %xmm0 %xmm1 1987 call _key_expansion_256b 1988 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 1989 call _key_expansion_256a 1990 AESKEYGENASSIST 0x10 %xmm0 %xmm1 1991 call _key_expansion_256b 1992 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 1993 call _key_expansion_256a 1994 AESKEYGENASSIST 0x20 %xmm0 %xmm1 1995 call _key_expansion_256b 1996 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 1997 call _key_expansion_256a 1998 jmp .Ldec_key 1999.Lenc_key192: 2000 movq 0x10(UKEYP), %xmm2 # other user key 2001 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 2002 call _key_expansion_192a 2003 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 2004 call _key_expansion_192b 2005 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 2006 call _key_expansion_192a 2007 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 2008 call _key_expansion_192b 2009 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 2010 call _key_expansion_192a 2011 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 2012 call _key_expansion_192b 2013 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 2014 call _key_expansion_192a 2015 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8 2016 call _key_expansion_192b 2017 jmp .Ldec_key 2018.Lenc_key128: 2019 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1 2020 call _key_expansion_128 2021 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2 2022 call _key_expansion_128 2023 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3 2024 call _key_expansion_128 2025 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4 2026 call _key_expansion_128 2027 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5 2028 call _key_expansion_128 2029 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6 2030 call _key_expansion_128 2031 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7 2032 call _key_expansion_128 2033 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8 2034 call _key_expansion_128 2035 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9 2036 call _key_expansion_128 2037 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 2038 call _key_expansion_128 2039.Ldec_key: 2040 sub $0x10, TKEYP 2041 movaps (KEYP), %xmm0 2042 movaps (TKEYP), %xmm1 2043 movaps %xmm0, 240(TKEYP) 2044 movaps %xmm1, 240(KEYP) 2045 add $0x10, KEYP 2046 lea 240-16(TKEYP), UKEYP 2047.align 4 2048.Ldec_key_loop: 2049 movaps (KEYP), %xmm0 2050 AESIMC %xmm0 %xmm1 2051 movaps %xmm1, (UKEYP) 2052 add $0x10, KEYP 2053 sub $0x10, UKEYP 2054 cmp TKEYP, KEYP 2055 jb .Ldec_key_loop 2056 xor AREG, AREG 2057#ifndef __x86_64__ 2058 popl KEYP 2059#endif 2060 FRAME_END 2061 ret 2062ENDPROC(aesni_set_key) 2063 2064/* 2065 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 2066 */ 2067ENTRY(aesni_enc) 2068 FRAME_BEGIN 2069#ifndef __x86_64__ 2070 pushl KEYP 2071 pushl KLEN 2072 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx 2073 movl (FRAME_OFFSET+16)(%esp), OUTP # dst 2074 movl (FRAME_OFFSET+20)(%esp), INP # src 2075#endif 2076 movl 480(KEYP), KLEN # key length 2077 movups (INP), STATE # input 2078 call _aesni_enc1 2079 movups STATE, (OUTP) # output 2080#ifndef __x86_64__ 2081 popl KLEN 2082 popl KEYP 2083#endif 2084 FRAME_END 2085 ret 2086ENDPROC(aesni_enc) 2087 2088/* 2089 * _aesni_enc1: internal ABI 2090 * input: 2091 * KEYP: key struct pointer 2092 * KLEN: round count 2093 * STATE: initial state (input) 2094 * output: 2095 * STATE: finial state (output) 2096 * changed: 2097 * KEY 2098 * TKEYP (T1) 2099 */ 2100.align 4 2101_aesni_enc1: 2102 movaps (KEYP), KEY # key 2103 mov KEYP, TKEYP 2104 pxor KEY, STATE # round 0 2105 add $0x30, TKEYP 2106 cmp $24, KLEN 2107 jb .Lenc128 2108 lea 0x20(TKEYP), TKEYP 2109 je .Lenc192 2110 add $0x20, TKEYP 2111 movaps -0x60(TKEYP), KEY 2112 AESENC KEY STATE 2113 movaps -0x50(TKEYP), KEY 2114 AESENC KEY STATE 2115.align 4 2116.Lenc192: 2117 movaps -0x40(TKEYP), KEY 2118 AESENC KEY STATE 2119 movaps -0x30(TKEYP), KEY 2120 AESENC KEY STATE 2121.align 4 2122.Lenc128: 2123 movaps -0x20(TKEYP), KEY 2124 AESENC KEY STATE 2125 movaps -0x10(TKEYP), KEY 2126 AESENC KEY STATE 2127 movaps (TKEYP), KEY 2128 AESENC KEY STATE 2129 movaps 0x10(TKEYP), KEY 2130 AESENC KEY STATE 2131 movaps 0x20(TKEYP), KEY 2132 AESENC KEY STATE 2133 movaps 0x30(TKEYP), KEY 2134 AESENC KEY STATE 2135 movaps 0x40(TKEYP), KEY 2136 AESENC KEY STATE 2137 movaps 0x50(TKEYP), KEY 2138 AESENC KEY STATE 2139 movaps 0x60(TKEYP), KEY 2140 AESENC KEY STATE 2141 movaps 0x70(TKEYP), KEY 2142 AESENCLAST KEY STATE 2143 ret 2144ENDPROC(_aesni_enc1) 2145 2146/* 2147 * _aesni_enc4: internal ABI 2148 * input: 2149 * KEYP: key struct pointer 2150 * KLEN: round count 2151 * STATE1: initial state (input) 2152 * STATE2 2153 * STATE3 2154 * STATE4 2155 * output: 2156 * STATE1: finial state (output) 2157 * STATE2 2158 * STATE3 2159 * STATE4 2160 * changed: 2161 * KEY 2162 * TKEYP (T1) 2163 */ 2164.align 4 2165_aesni_enc4: 2166 movaps (KEYP), KEY # key 2167 mov KEYP, TKEYP 2168 pxor KEY, STATE1 # round 0 2169 pxor KEY, STATE2 2170 pxor KEY, STATE3 2171 pxor KEY, STATE4 2172 add $0x30, TKEYP 2173 cmp $24, KLEN 2174 jb .L4enc128 2175 lea 0x20(TKEYP), TKEYP 2176 je .L4enc192 2177 add $0x20, TKEYP 2178 movaps -0x60(TKEYP), KEY 2179 AESENC KEY STATE1 2180 AESENC KEY STATE2 2181 AESENC KEY STATE3 2182 AESENC KEY STATE4 2183 movaps -0x50(TKEYP), KEY 2184 AESENC KEY STATE1 2185 AESENC KEY STATE2 2186 AESENC KEY STATE3 2187 AESENC KEY STATE4 2188#.align 4 2189.L4enc192: 2190 movaps -0x40(TKEYP), KEY 2191 AESENC KEY STATE1 2192 AESENC KEY STATE2 2193 AESENC KEY STATE3 2194 AESENC KEY STATE4 2195 movaps -0x30(TKEYP), KEY 2196 AESENC KEY STATE1 2197 AESENC KEY STATE2 2198 AESENC KEY STATE3 2199 AESENC KEY STATE4 2200#.align 4 2201.L4enc128: 2202 movaps -0x20(TKEYP), KEY 2203 AESENC KEY STATE1 2204 AESENC KEY STATE2 2205 AESENC KEY STATE3 2206 AESENC KEY STATE4 2207 movaps -0x10(TKEYP), KEY 2208 AESENC KEY STATE1 2209 AESENC KEY STATE2 2210 AESENC KEY STATE3 2211 AESENC KEY STATE4 2212 movaps (TKEYP), KEY 2213 AESENC KEY STATE1 2214 AESENC KEY STATE2 2215 AESENC KEY STATE3 2216 AESENC KEY STATE4 2217 movaps 0x10(TKEYP), KEY 2218 AESENC KEY STATE1 2219 AESENC KEY STATE2 2220 AESENC KEY STATE3 2221 AESENC KEY STATE4 2222 movaps 0x20(TKEYP), KEY 2223 AESENC KEY STATE1 2224 AESENC KEY STATE2 2225 AESENC KEY STATE3 2226 AESENC KEY STATE4 2227 movaps 0x30(TKEYP), KEY 2228 AESENC KEY STATE1 2229 AESENC KEY STATE2 2230 AESENC KEY STATE3 2231 AESENC KEY STATE4 2232 movaps 0x40(TKEYP), KEY 2233 AESENC KEY STATE1 2234 AESENC KEY STATE2 2235 AESENC KEY STATE3 2236 AESENC KEY STATE4 2237 movaps 0x50(TKEYP), KEY 2238 AESENC KEY STATE1 2239 AESENC KEY STATE2 2240 AESENC KEY STATE3 2241 AESENC KEY STATE4 2242 movaps 0x60(TKEYP), KEY 2243 AESENC KEY STATE1 2244 AESENC KEY STATE2 2245 AESENC KEY STATE3 2246 AESENC KEY STATE4 2247 movaps 0x70(TKEYP), KEY 2248 AESENCLAST KEY STATE1 # last round 2249 AESENCLAST KEY STATE2 2250 AESENCLAST KEY STATE3 2251 AESENCLAST KEY STATE4 2252 ret 2253ENDPROC(_aesni_enc4) 2254 2255/* 2256 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 2257 */ 2258ENTRY(aesni_dec) 2259 FRAME_BEGIN 2260#ifndef __x86_64__ 2261 pushl KEYP 2262 pushl KLEN 2263 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx 2264 movl (FRAME_OFFSET+16)(%esp), OUTP # dst 2265 movl (FRAME_OFFSET+20)(%esp), INP # src 2266#endif 2267 mov 480(KEYP), KLEN # key length 2268 add $240, KEYP 2269 movups (INP), STATE # input 2270 call _aesni_dec1 2271 movups STATE, (OUTP) #output 2272#ifndef __x86_64__ 2273 popl KLEN 2274 popl KEYP 2275#endif 2276 FRAME_END 2277 ret 2278ENDPROC(aesni_dec) 2279 2280/* 2281 * _aesni_dec1: internal ABI 2282 * input: 2283 * KEYP: key struct pointer 2284 * KLEN: key length 2285 * STATE: initial state (input) 2286 * output: 2287 * STATE: finial state (output) 2288 * changed: 2289 * KEY 2290 * TKEYP (T1) 2291 */ 2292.align 4 2293_aesni_dec1: 2294 movaps (KEYP), KEY # key 2295 mov KEYP, TKEYP 2296 pxor KEY, STATE # round 0 2297 add $0x30, TKEYP 2298 cmp $24, KLEN 2299 jb .Ldec128 2300 lea 0x20(TKEYP), TKEYP 2301 je .Ldec192 2302 add $0x20, TKEYP 2303 movaps -0x60(TKEYP), KEY 2304 AESDEC KEY STATE 2305 movaps -0x50(TKEYP), KEY 2306 AESDEC KEY STATE 2307.align 4 2308.Ldec192: 2309 movaps -0x40(TKEYP), KEY 2310 AESDEC KEY STATE 2311 movaps -0x30(TKEYP), KEY 2312 AESDEC KEY STATE 2313.align 4 2314.Ldec128: 2315 movaps -0x20(TKEYP), KEY 2316 AESDEC KEY STATE 2317 movaps -0x10(TKEYP), KEY 2318 AESDEC KEY STATE 2319 movaps (TKEYP), KEY 2320 AESDEC KEY STATE 2321 movaps 0x10(TKEYP), KEY 2322 AESDEC KEY STATE 2323 movaps 0x20(TKEYP), KEY 2324 AESDEC KEY STATE 2325 movaps 0x30(TKEYP), KEY 2326 AESDEC KEY STATE 2327 movaps 0x40(TKEYP), KEY 2328 AESDEC KEY STATE 2329 movaps 0x50(TKEYP), KEY 2330 AESDEC KEY STATE 2331 movaps 0x60(TKEYP), KEY 2332 AESDEC KEY STATE 2333 movaps 0x70(TKEYP), KEY 2334 AESDECLAST KEY STATE 2335 ret 2336ENDPROC(_aesni_dec1) 2337 2338/* 2339 * _aesni_dec4: internal ABI 2340 * input: 2341 * KEYP: key struct pointer 2342 * KLEN: key length 2343 * STATE1: initial state (input) 2344 * STATE2 2345 * STATE3 2346 * STATE4 2347 * output: 2348 * STATE1: finial state (output) 2349 * STATE2 2350 * STATE3 2351 * STATE4 2352 * changed: 2353 * KEY 2354 * TKEYP (T1) 2355 */ 2356.align 4 2357_aesni_dec4: 2358 movaps (KEYP), KEY # key 2359 mov KEYP, TKEYP 2360 pxor KEY, STATE1 # round 0 2361 pxor KEY, STATE2 2362 pxor KEY, STATE3 2363 pxor KEY, STATE4 2364 add $0x30, TKEYP 2365 cmp $24, KLEN 2366 jb .L4dec128 2367 lea 0x20(TKEYP), TKEYP 2368 je .L4dec192 2369 add $0x20, TKEYP 2370 movaps -0x60(TKEYP), KEY 2371 AESDEC KEY STATE1 2372 AESDEC KEY STATE2 2373 AESDEC KEY STATE3 2374 AESDEC KEY STATE4 2375 movaps -0x50(TKEYP), KEY 2376 AESDEC KEY STATE1 2377 AESDEC KEY STATE2 2378 AESDEC KEY STATE3 2379 AESDEC KEY STATE4 2380.align 4 2381.L4dec192: 2382 movaps -0x40(TKEYP), KEY 2383 AESDEC KEY STATE1 2384 AESDEC KEY STATE2 2385 AESDEC KEY STATE3 2386 AESDEC KEY STATE4 2387 movaps -0x30(TKEYP), KEY 2388 AESDEC KEY STATE1 2389 AESDEC KEY STATE2 2390 AESDEC KEY STATE3 2391 AESDEC KEY STATE4 2392.align 4 2393.L4dec128: 2394 movaps -0x20(TKEYP), KEY 2395 AESDEC KEY STATE1 2396 AESDEC KEY STATE2 2397 AESDEC KEY STATE3 2398 AESDEC KEY STATE4 2399 movaps -0x10(TKEYP), KEY 2400 AESDEC KEY STATE1 2401 AESDEC KEY STATE2 2402 AESDEC KEY STATE3 2403 AESDEC KEY STATE4 2404 movaps (TKEYP), KEY 2405 AESDEC KEY STATE1 2406 AESDEC KEY STATE2 2407 AESDEC KEY STATE3 2408 AESDEC KEY STATE4 2409 movaps 0x10(TKEYP), KEY 2410 AESDEC KEY STATE1 2411 AESDEC KEY STATE2 2412 AESDEC KEY STATE3 2413 AESDEC KEY STATE4 2414 movaps 0x20(TKEYP), KEY 2415 AESDEC KEY STATE1 2416 AESDEC KEY STATE2 2417 AESDEC KEY STATE3 2418 AESDEC KEY STATE4 2419 movaps 0x30(TKEYP), KEY 2420 AESDEC KEY STATE1 2421 AESDEC KEY STATE2 2422 AESDEC KEY STATE3 2423 AESDEC KEY STATE4 2424 movaps 0x40(TKEYP), KEY 2425 AESDEC KEY STATE1 2426 AESDEC KEY STATE2 2427 AESDEC KEY STATE3 2428 AESDEC KEY STATE4 2429 movaps 0x50(TKEYP), KEY 2430 AESDEC KEY STATE1 2431 AESDEC KEY STATE2 2432 AESDEC KEY STATE3 2433 AESDEC KEY STATE4 2434 movaps 0x60(TKEYP), KEY 2435 AESDEC KEY STATE1 2436 AESDEC KEY STATE2 2437 AESDEC KEY STATE3 2438 AESDEC KEY STATE4 2439 movaps 0x70(TKEYP), KEY 2440 AESDECLAST KEY STATE1 # last round 2441 AESDECLAST KEY STATE2 2442 AESDECLAST KEY STATE3 2443 AESDECLAST KEY STATE4 2444 ret 2445ENDPROC(_aesni_dec4) 2446 2447/* 2448 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2449 * size_t len) 2450 */ 2451ENTRY(aesni_ecb_enc) 2452 FRAME_BEGIN 2453#ifndef __x86_64__ 2454 pushl LEN 2455 pushl KEYP 2456 pushl KLEN 2457 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2458 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2459 movl (FRAME_OFFSET+24)(%esp), INP # src 2460 movl (FRAME_OFFSET+28)(%esp), LEN # len 2461#endif 2462 test LEN, LEN # check length 2463 jz .Lecb_enc_ret 2464 mov 480(KEYP), KLEN 2465 cmp $16, LEN 2466 jb .Lecb_enc_ret 2467 cmp $64, LEN 2468 jb .Lecb_enc_loop1 2469.align 4 2470.Lecb_enc_loop4: 2471 movups (INP), STATE1 2472 movups 0x10(INP), STATE2 2473 movups 0x20(INP), STATE3 2474 movups 0x30(INP), STATE4 2475 call _aesni_enc4 2476 movups STATE1, (OUTP) 2477 movups STATE2, 0x10(OUTP) 2478 movups STATE3, 0x20(OUTP) 2479 movups STATE4, 0x30(OUTP) 2480 sub $64, LEN 2481 add $64, INP 2482 add $64, OUTP 2483 cmp $64, LEN 2484 jge .Lecb_enc_loop4 2485 cmp $16, LEN 2486 jb .Lecb_enc_ret 2487.align 4 2488.Lecb_enc_loop1: 2489 movups (INP), STATE1 2490 call _aesni_enc1 2491 movups STATE1, (OUTP) 2492 sub $16, LEN 2493 add $16, INP 2494 add $16, OUTP 2495 cmp $16, LEN 2496 jge .Lecb_enc_loop1 2497.Lecb_enc_ret: 2498#ifndef __x86_64__ 2499 popl KLEN 2500 popl KEYP 2501 popl LEN 2502#endif 2503 FRAME_END 2504 ret 2505ENDPROC(aesni_ecb_enc) 2506 2507/* 2508 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2509 * size_t len); 2510 */ 2511ENTRY(aesni_ecb_dec) 2512 FRAME_BEGIN 2513#ifndef __x86_64__ 2514 pushl LEN 2515 pushl KEYP 2516 pushl KLEN 2517 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2518 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2519 movl (FRAME_OFFSET+24)(%esp), INP # src 2520 movl (FRAME_OFFSET+28)(%esp), LEN # len 2521#endif 2522 test LEN, LEN 2523 jz .Lecb_dec_ret 2524 mov 480(KEYP), KLEN 2525 add $240, KEYP 2526 cmp $16, LEN 2527 jb .Lecb_dec_ret 2528 cmp $64, LEN 2529 jb .Lecb_dec_loop1 2530.align 4 2531.Lecb_dec_loop4: 2532 movups (INP), STATE1 2533 movups 0x10(INP), STATE2 2534 movups 0x20(INP), STATE3 2535 movups 0x30(INP), STATE4 2536 call _aesni_dec4 2537 movups STATE1, (OUTP) 2538 movups STATE2, 0x10(OUTP) 2539 movups STATE3, 0x20(OUTP) 2540 movups STATE4, 0x30(OUTP) 2541 sub $64, LEN 2542 add $64, INP 2543 add $64, OUTP 2544 cmp $64, LEN 2545 jge .Lecb_dec_loop4 2546 cmp $16, LEN 2547 jb .Lecb_dec_ret 2548.align 4 2549.Lecb_dec_loop1: 2550 movups (INP), STATE1 2551 call _aesni_dec1 2552 movups STATE1, (OUTP) 2553 sub $16, LEN 2554 add $16, INP 2555 add $16, OUTP 2556 cmp $16, LEN 2557 jge .Lecb_dec_loop1 2558.Lecb_dec_ret: 2559#ifndef __x86_64__ 2560 popl KLEN 2561 popl KEYP 2562 popl LEN 2563#endif 2564 FRAME_END 2565 ret 2566ENDPROC(aesni_ecb_dec) 2567 2568/* 2569 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2570 * size_t len, u8 *iv) 2571 */ 2572ENTRY(aesni_cbc_enc) 2573 FRAME_BEGIN 2574#ifndef __x86_64__ 2575 pushl IVP 2576 pushl LEN 2577 pushl KEYP 2578 pushl KLEN 2579 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2580 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2581 movl (FRAME_OFFSET+28)(%esp), INP # src 2582 movl (FRAME_OFFSET+32)(%esp), LEN # len 2583 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2584#endif 2585 cmp $16, LEN 2586 jb .Lcbc_enc_ret 2587 mov 480(KEYP), KLEN 2588 movups (IVP), STATE # load iv as initial state 2589.align 4 2590.Lcbc_enc_loop: 2591 movups (INP), IN # load input 2592 pxor IN, STATE 2593 call _aesni_enc1 2594 movups STATE, (OUTP) # store output 2595 sub $16, LEN 2596 add $16, INP 2597 add $16, OUTP 2598 cmp $16, LEN 2599 jge .Lcbc_enc_loop 2600 movups STATE, (IVP) 2601.Lcbc_enc_ret: 2602#ifndef __x86_64__ 2603 popl KLEN 2604 popl KEYP 2605 popl LEN 2606 popl IVP 2607#endif 2608 FRAME_END 2609 ret 2610ENDPROC(aesni_cbc_enc) 2611 2612/* 2613 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2614 * size_t len, u8 *iv) 2615 */ 2616ENTRY(aesni_cbc_dec) 2617 FRAME_BEGIN 2618#ifndef __x86_64__ 2619 pushl IVP 2620 pushl LEN 2621 pushl KEYP 2622 pushl KLEN 2623 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2624 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2625 movl (FRAME_OFFSET+28)(%esp), INP # src 2626 movl (FRAME_OFFSET+32)(%esp), LEN # len 2627 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2628#endif 2629 cmp $16, LEN 2630 jb .Lcbc_dec_just_ret 2631 mov 480(KEYP), KLEN 2632 add $240, KEYP 2633 movups (IVP), IV 2634 cmp $64, LEN 2635 jb .Lcbc_dec_loop1 2636.align 4 2637.Lcbc_dec_loop4: 2638 movups (INP), IN1 2639 movaps IN1, STATE1 2640 movups 0x10(INP), IN2 2641 movaps IN2, STATE2 2642#ifdef __x86_64__ 2643 movups 0x20(INP), IN3 2644 movaps IN3, STATE3 2645 movups 0x30(INP), IN4 2646 movaps IN4, STATE4 2647#else 2648 movups 0x20(INP), IN1 2649 movaps IN1, STATE3 2650 movups 0x30(INP), IN2 2651 movaps IN2, STATE4 2652#endif 2653 call _aesni_dec4 2654 pxor IV, STATE1 2655#ifdef __x86_64__ 2656 pxor IN1, STATE2 2657 pxor IN2, STATE3 2658 pxor IN3, STATE4 2659 movaps IN4, IV 2660#else 2661 pxor IN1, STATE4 2662 movaps IN2, IV 2663 movups (INP), IN1 2664 pxor IN1, STATE2 2665 movups 0x10(INP), IN2 2666 pxor IN2, STATE3 2667#endif 2668 movups STATE1, (OUTP) 2669 movups STATE2, 0x10(OUTP) 2670 movups STATE3, 0x20(OUTP) 2671 movups STATE4, 0x30(OUTP) 2672 sub $64, LEN 2673 add $64, INP 2674 add $64, OUTP 2675 cmp $64, LEN 2676 jge .Lcbc_dec_loop4 2677 cmp $16, LEN 2678 jb .Lcbc_dec_ret 2679.align 4 2680.Lcbc_dec_loop1: 2681 movups (INP), IN 2682 movaps IN, STATE 2683 call _aesni_dec1 2684 pxor IV, STATE 2685 movups STATE, (OUTP) 2686 movaps IN, IV 2687 sub $16, LEN 2688 add $16, INP 2689 add $16, OUTP 2690 cmp $16, LEN 2691 jge .Lcbc_dec_loop1 2692.Lcbc_dec_ret: 2693 movups IV, (IVP) 2694.Lcbc_dec_just_ret: 2695#ifndef __x86_64__ 2696 popl KLEN 2697 popl KEYP 2698 popl LEN 2699 popl IVP 2700#endif 2701 FRAME_END 2702 ret 2703ENDPROC(aesni_cbc_dec) 2704 2705#ifdef __x86_64__ 2706.pushsection .rodata 2707.align 16 2708.Lbswap_mask: 2709 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2710.popsection 2711 2712/* 2713 * _aesni_inc_init: internal ABI 2714 * setup registers used by _aesni_inc 2715 * input: 2716 * IV 2717 * output: 2718 * CTR: == IV, in little endian 2719 * TCTR_LOW: == lower qword of CTR 2720 * INC: == 1, in little endian 2721 * BSWAP_MASK == endian swapping mask 2722 */ 2723.align 4 2724_aesni_inc_init: 2725 movaps .Lbswap_mask, BSWAP_MASK 2726 movaps IV, CTR 2727 PSHUFB_XMM BSWAP_MASK CTR 2728 mov $1, TCTR_LOW 2729 MOVQ_R64_XMM TCTR_LOW INC 2730 MOVQ_R64_XMM CTR TCTR_LOW 2731 ret 2732ENDPROC(_aesni_inc_init) 2733 2734/* 2735 * _aesni_inc: internal ABI 2736 * Increase IV by 1, IV is in big endian 2737 * input: 2738 * IV 2739 * CTR: == IV, in little endian 2740 * TCTR_LOW: == lower qword of CTR 2741 * INC: == 1, in little endian 2742 * BSWAP_MASK == endian swapping mask 2743 * output: 2744 * IV: Increase by 1 2745 * changed: 2746 * CTR: == output IV, in little endian 2747 * TCTR_LOW: == lower qword of CTR 2748 */ 2749.align 4 2750_aesni_inc: 2751 paddq INC, CTR 2752 add $1, TCTR_LOW 2753 jnc .Linc_low 2754 pslldq $8, INC 2755 paddq INC, CTR 2756 psrldq $8, INC 2757.Linc_low: 2758 movaps CTR, IV 2759 PSHUFB_XMM BSWAP_MASK IV 2760 ret 2761ENDPROC(_aesni_inc) 2762 2763/* 2764 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2765 * size_t len, u8 *iv) 2766 */ 2767ENTRY(aesni_ctr_enc) 2768 FRAME_BEGIN 2769 cmp $16, LEN 2770 jb .Lctr_enc_just_ret 2771 mov 480(KEYP), KLEN 2772 movups (IVP), IV 2773 call _aesni_inc_init 2774 cmp $64, LEN 2775 jb .Lctr_enc_loop1 2776.align 4 2777.Lctr_enc_loop4: 2778 movaps IV, STATE1 2779 call _aesni_inc 2780 movups (INP), IN1 2781 movaps IV, STATE2 2782 call _aesni_inc 2783 movups 0x10(INP), IN2 2784 movaps IV, STATE3 2785 call _aesni_inc 2786 movups 0x20(INP), IN3 2787 movaps IV, STATE4 2788 call _aesni_inc 2789 movups 0x30(INP), IN4 2790 call _aesni_enc4 2791 pxor IN1, STATE1 2792 movups STATE1, (OUTP) 2793 pxor IN2, STATE2 2794 movups STATE2, 0x10(OUTP) 2795 pxor IN3, STATE3 2796 movups STATE3, 0x20(OUTP) 2797 pxor IN4, STATE4 2798 movups STATE4, 0x30(OUTP) 2799 sub $64, LEN 2800 add $64, INP 2801 add $64, OUTP 2802 cmp $64, LEN 2803 jge .Lctr_enc_loop4 2804 cmp $16, LEN 2805 jb .Lctr_enc_ret 2806.align 4 2807.Lctr_enc_loop1: 2808 movaps IV, STATE 2809 call _aesni_inc 2810 movups (INP), IN 2811 call _aesni_enc1 2812 pxor IN, STATE 2813 movups STATE, (OUTP) 2814 sub $16, LEN 2815 add $16, INP 2816 add $16, OUTP 2817 cmp $16, LEN 2818 jge .Lctr_enc_loop1 2819.Lctr_enc_ret: 2820 movups IV, (IVP) 2821.Lctr_enc_just_ret: 2822 FRAME_END 2823 ret 2824ENDPROC(aesni_ctr_enc) 2825 2826/* 2827 * _aesni_gf128mul_x_ble: internal ABI 2828 * Multiply in GF(2^128) for XTS IVs 2829 * input: 2830 * IV: current IV 2831 * GF128MUL_MASK == mask with 0x87 and 0x01 2832 * output: 2833 * IV: next IV 2834 * changed: 2835 * CTR: == temporary value 2836 */ 2837#define _aesni_gf128mul_x_ble() \ 2838 pshufd $0x13, IV, CTR; \ 2839 paddq IV, IV; \ 2840 psrad $31, CTR; \ 2841 pand GF128MUL_MASK, CTR; \ 2842 pxor CTR, IV; 2843 2844/* 2845 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2846 * bool enc, u8 *iv) 2847 */ 2848ENTRY(aesni_xts_crypt8) 2849 FRAME_BEGIN 2850 cmpb $0, %cl 2851 movl $0, %ecx 2852 movl $240, %r10d 2853 leaq _aesni_enc4, %r11 2854 leaq _aesni_dec4, %rax 2855 cmovel %r10d, %ecx 2856 cmoveq %rax, %r11 2857 2858 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK 2859 movups (IVP), IV 2860 2861 mov 480(KEYP), KLEN 2862 addq %rcx, KEYP 2863 2864 movdqa IV, STATE1 2865 movdqu 0x00(INP), INC 2866 pxor INC, STATE1 2867 movdqu IV, 0x00(OUTP) 2868 2869 _aesni_gf128mul_x_ble() 2870 movdqa IV, STATE2 2871 movdqu 0x10(INP), INC 2872 pxor INC, STATE2 2873 movdqu IV, 0x10(OUTP) 2874 2875 _aesni_gf128mul_x_ble() 2876 movdqa IV, STATE3 2877 movdqu 0x20(INP), INC 2878 pxor INC, STATE3 2879 movdqu IV, 0x20(OUTP) 2880 2881 _aesni_gf128mul_x_ble() 2882 movdqa IV, STATE4 2883 movdqu 0x30(INP), INC 2884 pxor INC, STATE4 2885 movdqu IV, 0x30(OUTP) 2886 2887 call *%r11 2888 2889 movdqu 0x00(OUTP), INC 2890 pxor INC, STATE1 2891 movdqu STATE1, 0x00(OUTP) 2892 2893 _aesni_gf128mul_x_ble() 2894 movdqa IV, STATE1 2895 movdqu 0x40(INP), INC 2896 pxor INC, STATE1 2897 movdqu IV, 0x40(OUTP) 2898 2899 movdqu 0x10(OUTP), INC 2900 pxor INC, STATE2 2901 movdqu STATE2, 0x10(OUTP) 2902 2903 _aesni_gf128mul_x_ble() 2904 movdqa IV, STATE2 2905 movdqu 0x50(INP), INC 2906 pxor INC, STATE2 2907 movdqu IV, 0x50(OUTP) 2908 2909 movdqu 0x20(OUTP), INC 2910 pxor INC, STATE3 2911 movdqu STATE3, 0x20(OUTP) 2912 2913 _aesni_gf128mul_x_ble() 2914 movdqa IV, STATE3 2915 movdqu 0x60(INP), INC 2916 pxor INC, STATE3 2917 movdqu IV, 0x60(OUTP) 2918 2919 movdqu 0x30(OUTP), INC 2920 pxor INC, STATE4 2921 movdqu STATE4, 0x30(OUTP) 2922 2923 _aesni_gf128mul_x_ble() 2924 movdqa IV, STATE4 2925 movdqu 0x70(INP), INC 2926 pxor INC, STATE4 2927 movdqu IV, 0x70(OUTP) 2928 2929 _aesni_gf128mul_x_ble() 2930 movups IV, (IVP) 2931 2932 call *%r11 2933 2934 movdqu 0x40(OUTP), INC 2935 pxor INC, STATE1 2936 movdqu STATE1, 0x40(OUTP) 2937 2938 movdqu 0x50(OUTP), INC 2939 pxor INC, STATE2 2940 movdqu STATE2, 0x50(OUTP) 2941 2942 movdqu 0x60(OUTP), INC 2943 pxor INC, STATE3 2944 movdqu STATE3, 0x60(OUTP) 2945 2946 movdqu 0x70(OUTP), INC 2947 pxor INC, STATE4 2948 movdqu STATE4, 0x70(OUTP) 2949 2950 FRAME_END 2951 ret 2952ENDPROC(aesni_xts_crypt8) 2953 2954#endif 2955