1/* 2 * Implement AES algorithm in Intel AES-NI instructions. 3 * 4 * The white paper of AES-NI instructions can be downloaded from: 5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf 6 * 7 * Copyright (C) 2008, Intel Corp. 8 * Author: Huang Ying <ying.huang@intel.com> 9 * Vinodh Gopal <vinodh.gopal@intel.com> 10 * Kahraman Akdemir 11 * 12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD 13 * interface for 64-bit kernels. 14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) 15 * Aidan O'Mahony (aidan.o.mahony@intel.com) 16 * Adrian Hoban <adrian.hoban@intel.com> 17 * James Guilford (james.guilford@intel.com) 18 * Gabriele Paoloni <gabriele.paoloni@intel.com> 19 * Tadeusz Struk (tadeusz.struk@intel.com) 20 * Wajdi Feghali (wajdi.k.feghali@intel.com) 21 * Copyright (c) 2010, Intel Corporation. 22 * 23 * Ported x86_64 version to x86: 24 * Author: Mathias Krause <minipli@googlemail.com> 25 * 26 * This program is free software; you can redistribute it and/or modify 27 * it under the terms of the GNU General Public License as published by 28 * the Free Software Foundation; either version 2 of the License, or 29 * (at your option) any later version. 30 */ 31 32#include <linux/linkage.h> 33#include <asm/inst.h> 34#include <asm/frame.h> 35 36/* 37 * The following macros are used to move an (un)aligned 16 byte value to/from 38 * an XMM register. This can done for either FP or integer values, for FP use 39 * movaps (move aligned packed single) or integer use movdqa (move double quad 40 * aligned). It doesn't make a performance difference which instruction is used 41 * since Nehalem (original Core i7) was released. However, the movaps is a byte 42 * shorter, so that is the one we'll use for now. (same for unaligned). 43 */ 44#define MOVADQ movaps 45#define MOVUDQ movups 46 47#ifdef __x86_64__ 48 49# constants in mergeable sections, linker can reorder and merge 50.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 51.align 16 52.Lgf128mul_x_ble_mask: 53 .octa 0x00000000000000010000000000000087 54.section .rodata.cst16.POLY, "aM", @progbits, 16 55.align 16 56POLY: .octa 0xC2000000000000000000000000000001 57.section .rodata.cst16.TWOONE, "aM", @progbits, 16 58.align 16 59TWOONE: .octa 0x00000001000000000000000000000001 60 61.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 62.align 16 63SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F 64.section .rodata.cst16.MASK1, "aM", @progbits, 16 65.align 16 66MASK1: .octa 0x0000000000000000ffffffffffffffff 67.section .rodata.cst16.MASK2, "aM", @progbits, 16 68.align 16 69MASK2: .octa 0xffffffffffffffff0000000000000000 70.section .rodata.cst16.ONE, "aM", @progbits, 16 71.align 16 72ONE: .octa 0x00000000000000000000000000000001 73.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 74.align 16 75F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 76.section .rodata.cst16.dec, "aM", @progbits, 16 77.align 16 78dec: .octa 0x1 79.section .rodata.cst16.enc, "aM", @progbits, 16 80.align 16 81enc: .octa 0x2 82 83# order of these constants should not change. 84# more specifically, ALL_F should follow SHIFT_MASK, 85# and zero should follow ALL_F 86.section .rodata, "a", @progbits 87.align 16 88SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 89ALL_F: .octa 0xffffffffffffffffffffffffffffffff 90 .octa 0x00000000000000000000000000000000 91 92 93.text 94 95 96#define STACK_OFFSET 8*3 97#define HashKey 16*0 // store HashKey <<1 mod poly here 98#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here 99#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here 100#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here 101#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64 102 // bits of HashKey <<1 mod poly here 103 //(for Karatsuba purposes) 104#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64 105 // bits of HashKey^2 <<1 mod poly here 106 // (for Karatsuba purposes) 107#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64 108 // bits of HashKey^3 <<1 mod poly here 109 // (for Karatsuba purposes) 110#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64 111 // bits of HashKey^4 <<1 mod poly here 112 // (for Karatsuba purposes) 113#define VARIABLE_OFFSET 16*8 114 115#define arg1 rdi 116#define arg2 rsi 117#define arg3 rdx 118#define arg4 rcx 119#define arg5 r8 120#define arg6 r9 121#define arg7 STACK_OFFSET+8(%r14) 122#define arg8 STACK_OFFSET+16(%r14) 123#define arg9 STACK_OFFSET+24(%r14) 124#define arg10 STACK_OFFSET+32(%r14) 125#define keysize 2*15*16(%arg1) 126#endif 127 128 129#define STATE1 %xmm0 130#define STATE2 %xmm4 131#define STATE3 %xmm5 132#define STATE4 %xmm6 133#define STATE STATE1 134#define IN1 %xmm1 135#define IN2 %xmm7 136#define IN3 %xmm8 137#define IN4 %xmm9 138#define IN IN1 139#define KEY %xmm2 140#define IV %xmm3 141 142#define BSWAP_MASK %xmm10 143#define CTR %xmm11 144#define INC %xmm12 145 146#define GF128MUL_MASK %xmm10 147 148#ifdef __x86_64__ 149#define AREG %rax 150#define KEYP %rdi 151#define OUTP %rsi 152#define UKEYP OUTP 153#define INP %rdx 154#define LEN %rcx 155#define IVP %r8 156#define KLEN %r9d 157#define T1 %r10 158#define TKEYP T1 159#define T2 %r11 160#define TCTR_LOW T2 161#else 162#define AREG %eax 163#define KEYP %edi 164#define OUTP AREG 165#define UKEYP OUTP 166#define INP %edx 167#define LEN %esi 168#define IVP %ebp 169#define KLEN %ebx 170#define T1 %ecx 171#define TKEYP T1 172#endif 173 174 175#ifdef __x86_64__ 176/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 177* 178* 179* Input: A and B (128-bits each, bit-reflected) 180* Output: C = A*B*x mod poly, (i.e. >>1 ) 181* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 182* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 183* 184*/ 185.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 186 movdqa \GH, \TMP1 187 pshufd $78, \GH, \TMP2 188 pshufd $78, \HK, \TMP3 189 pxor \GH, \TMP2 # TMP2 = a1+a0 190 pxor \HK, \TMP3 # TMP3 = b1+b0 191 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 192 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 193 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) 194 pxor \GH, \TMP2 195 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) 196 movdqa \TMP2, \TMP3 197 pslldq $8, \TMP3 # left shift TMP3 2 DWs 198 psrldq $8, \TMP2 # right shift TMP2 2 DWs 199 pxor \TMP3, \GH 200 pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK 201 202 # first phase of the reduction 203 204 movdqa \GH, \TMP2 205 movdqa \GH, \TMP3 206 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 207 # in in order to perform 208 # independent shifts 209 pslld $31, \TMP2 # packed right shift <<31 210 pslld $30, \TMP3 # packed right shift <<30 211 pslld $25, \TMP4 # packed right shift <<25 212 pxor \TMP3, \TMP2 # xor the shifted versions 213 pxor \TMP4, \TMP2 214 movdqa \TMP2, \TMP5 215 psrldq $4, \TMP5 # right shift TMP5 1 DW 216 pslldq $12, \TMP2 # left shift TMP2 3 DWs 217 pxor \TMP2, \GH 218 219 # second phase of the reduction 220 221 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 222 # in in order to perform 223 # independent shifts 224 movdqa \GH,\TMP3 225 movdqa \GH,\TMP4 226 psrld $1,\TMP2 # packed left shift >>1 227 psrld $2,\TMP3 # packed left shift >>2 228 psrld $7,\TMP4 # packed left shift >>7 229 pxor \TMP3,\TMP2 # xor the shifted versions 230 pxor \TMP4,\TMP2 231 pxor \TMP5, \TMP2 232 pxor \TMP2, \GH 233 pxor \TMP1, \GH # result is in TMP1 234.endm 235 236/* 237* if a = number of total plaintext bytes 238* b = floor(a/16) 239* num_initial_blocks = b mod 4 240* encrypt the initial num_initial_blocks blocks and apply ghash on 241* the ciphertext 242* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 243* are clobbered 244* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified 245*/ 246 247 248.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 249XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 250 MOVADQ SHUF_MASK(%rip), %xmm14 251 mov arg7, %r10 # %r10 = AAD 252 mov arg8, %r12 # %r12 = aadLen 253 mov %r12, %r11 254 pxor %xmm\i, %xmm\i 255 256_get_AAD_loop\num_initial_blocks\operation: 257 movd (%r10), \TMP1 258 pslldq $12, \TMP1 259 psrldq $4, %xmm\i 260 pxor \TMP1, %xmm\i 261 add $4, %r10 262 sub $4, %r12 263 jne _get_AAD_loop\num_initial_blocks\operation 264 265 cmp $16, %r11 266 je _get_AAD_loop2_done\num_initial_blocks\operation 267 268 mov $16, %r12 269_get_AAD_loop2\num_initial_blocks\operation: 270 psrldq $4, %xmm\i 271 sub $4, %r12 272 cmp %r11, %r12 273 jne _get_AAD_loop2\num_initial_blocks\operation 274 275_get_AAD_loop2_done\num_initial_blocks\operation: 276 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 277 278 xor %r11, %r11 # initialise the data pointer offset as zero 279 280 # start AES for num_initial_blocks blocks 281 282 mov %arg5, %rax # %rax = *Y0 283 movdqu (%rax), \XMM0 # XMM0 = Y0 284 PSHUFB_XMM %xmm14, \XMM0 285 286.if (\i == 5) || (\i == 6) || (\i == 7) 287 MOVADQ ONE(%RIP),\TMP1 288 MOVADQ (%arg1),\TMP2 289.irpc index, \i_seq 290 paddd \TMP1, \XMM0 # INCR Y0 291 movdqa \XMM0, %xmm\index 292 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 293 pxor \TMP2, %xmm\index 294.endr 295 lea 0x10(%arg1),%r10 296 mov keysize,%eax 297 shr $2,%eax # 128->4, 192->6, 256->8 298 add $5,%eax # 128->9, 192->11, 256->13 299 300aes_loop_initial_dec\num_initial_blocks: 301 MOVADQ (%r10),\TMP1 302.irpc index, \i_seq 303 AESENC \TMP1, %xmm\index 304.endr 305 add $16,%r10 306 sub $1,%eax 307 jnz aes_loop_initial_dec\num_initial_blocks 308 309 MOVADQ (%r10), \TMP1 310.irpc index, \i_seq 311 AESENCLAST \TMP1, %xmm\index # Last Round 312.endr 313.irpc index, \i_seq 314 movdqu (%arg3 , %r11, 1), \TMP1 315 pxor \TMP1, %xmm\index 316 movdqu %xmm\index, (%arg2 , %r11, 1) 317 # write back plaintext/ciphertext for num_initial_blocks 318 add $16, %r11 319 320 movdqa \TMP1, %xmm\index 321 PSHUFB_XMM %xmm14, %xmm\index 322 # prepare plaintext/ciphertext for GHASH computation 323.endr 324.endif 325 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 326 # apply GHASH on num_initial_blocks blocks 327 328.if \i == 5 329 pxor %xmm5, %xmm6 330 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 331 pxor %xmm6, %xmm7 332 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 333 pxor %xmm7, %xmm8 334 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 335.elseif \i == 6 336 pxor %xmm6, %xmm7 337 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 338 pxor %xmm7, %xmm8 339 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 340.elseif \i == 7 341 pxor %xmm7, %xmm8 342 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 343.endif 344 cmp $64, %r13 345 jl _initial_blocks_done\num_initial_blocks\operation 346 # no need for precomputed values 347/* 348* 349* Precomputations for HashKey parallel with encryption of first 4 blocks. 350* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 351*/ 352 MOVADQ ONE(%rip), \TMP1 353 paddd \TMP1, \XMM0 # INCR Y0 354 MOVADQ \XMM0, \XMM1 355 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 356 357 paddd \TMP1, \XMM0 # INCR Y0 358 MOVADQ \XMM0, \XMM2 359 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 360 361 paddd \TMP1, \XMM0 # INCR Y0 362 MOVADQ \XMM0, \XMM3 363 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 364 365 paddd \TMP1, \XMM0 # INCR Y0 366 MOVADQ \XMM0, \XMM4 367 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 368 369 MOVADQ 0(%arg1),\TMP1 370 pxor \TMP1, \XMM1 371 pxor \TMP1, \XMM2 372 pxor \TMP1, \XMM3 373 pxor \TMP1, \XMM4 374 movdqa \TMP3, \TMP5 375 pshufd $78, \TMP3, \TMP1 376 pxor \TMP3, \TMP1 377 movdqa \TMP1, HashKey_k(%rsp) 378 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 379# TMP5 = HashKey^2<<1 (mod poly) 380 movdqa \TMP5, HashKey_2(%rsp) 381# HashKey_2 = HashKey^2<<1 (mod poly) 382 pshufd $78, \TMP5, \TMP1 383 pxor \TMP5, \TMP1 384 movdqa \TMP1, HashKey_2_k(%rsp) 385.irpc index, 1234 # do 4 rounds 386 movaps 0x10*\index(%arg1), \TMP1 387 AESENC \TMP1, \XMM1 388 AESENC \TMP1, \XMM2 389 AESENC \TMP1, \XMM3 390 AESENC \TMP1, \XMM4 391.endr 392 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 393# TMP5 = HashKey^3<<1 (mod poly) 394 movdqa \TMP5, HashKey_3(%rsp) 395 pshufd $78, \TMP5, \TMP1 396 pxor \TMP5, \TMP1 397 movdqa \TMP1, HashKey_3_k(%rsp) 398.irpc index, 56789 # do next 5 rounds 399 movaps 0x10*\index(%arg1), \TMP1 400 AESENC \TMP1, \XMM1 401 AESENC \TMP1, \XMM2 402 AESENC \TMP1, \XMM3 403 AESENC \TMP1, \XMM4 404.endr 405 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 406# TMP5 = HashKey^3<<1 (mod poly) 407 movdqa \TMP5, HashKey_4(%rsp) 408 pshufd $78, \TMP5, \TMP1 409 pxor \TMP5, \TMP1 410 movdqa \TMP1, HashKey_4_k(%rsp) 411 lea 0xa0(%arg1),%r10 412 mov keysize,%eax 413 shr $2,%eax # 128->4, 192->6, 256->8 414 sub $4,%eax # 128->0, 192->2, 256->4 415 jz aes_loop_pre_dec_done\num_initial_blocks 416 417aes_loop_pre_dec\num_initial_blocks: 418 MOVADQ (%r10),\TMP2 419.irpc index, 1234 420 AESENC \TMP2, %xmm\index 421.endr 422 add $16,%r10 423 sub $1,%eax 424 jnz aes_loop_pre_dec\num_initial_blocks 425 426aes_loop_pre_dec_done\num_initial_blocks: 427 MOVADQ (%r10), \TMP2 428 AESENCLAST \TMP2, \XMM1 429 AESENCLAST \TMP2, \XMM2 430 AESENCLAST \TMP2, \XMM3 431 AESENCLAST \TMP2, \XMM4 432 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 433 pxor \TMP1, \XMM1 434 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 435 movdqa \TMP1, \XMM1 436 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 437 pxor \TMP1, \XMM2 438 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 439 movdqa \TMP1, \XMM2 440 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 441 pxor \TMP1, \XMM3 442 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 443 movdqa \TMP1, \XMM3 444 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 445 pxor \TMP1, \XMM4 446 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 447 movdqa \TMP1, \XMM4 448 add $64, %r11 449 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 450 pxor \XMMDst, \XMM1 451# combine GHASHed value with the corresponding ciphertext 452 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 453 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 454 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 455 456_initial_blocks_done\num_initial_blocks\operation: 457 458.endm 459 460 461/* 462* if a = number of total plaintext bytes 463* b = floor(a/16) 464* num_initial_blocks = b mod 4 465* encrypt the initial num_initial_blocks blocks and apply ghash on 466* the ciphertext 467* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 468* are clobbered 469* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified 470*/ 471 472 473.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 474XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 475 MOVADQ SHUF_MASK(%rip), %xmm14 476 mov arg7, %r10 # %r10 = AAD 477 mov arg8, %r12 # %r12 = aadLen 478 mov %r12, %r11 479 pxor %xmm\i, %xmm\i 480_get_AAD_loop\num_initial_blocks\operation: 481 movd (%r10), \TMP1 482 pslldq $12, \TMP1 483 psrldq $4, %xmm\i 484 pxor \TMP1, %xmm\i 485 add $4, %r10 486 sub $4, %r12 487 jne _get_AAD_loop\num_initial_blocks\operation 488 cmp $16, %r11 489 je _get_AAD_loop2_done\num_initial_blocks\operation 490 mov $16, %r12 491_get_AAD_loop2\num_initial_blocks\operation: 492 psrldq $4, %xmm\i 493 sub $4, %r12 494 cmp %r11, %r12 495 jne _get_AAD_loop2\num_initial_blocks\operation 496_get_AAD_loop2_done\num_initial_blocks\operation: 497 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 498 499 xor %r11, %r11 # initialise the data pointer offset as zero 500 501 # start AES for num_initial_blocks blocks 502 503 mov %arg5, %rax # %rax = *Y0 504 movdqu (%rax), \XMM0 # XMM0 = Y0 505 PSHUFB_XMM %xmm14, \XMM0 506 507.if (\i == 5) || (\i == 6) || (\i == 7) 508 509 MOVADQ ONE(%RIP),\TMP1 510 MOVADQ 0(%arg1),\TMP2 511.irpc index, \i_seq 512 paddd \TMP1, \XMM0 # INCR Y0 513 MOVADQ \XMM0, %xmm\index 514 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 515 pxor \TMP2, %xmm\index 516.endr 517 lea 0x10(%arg1),%r10 518 mov keysize,%eax 519 shr $2,%eax # 128->4, 192->6, 256->8 520 add $5,%eax # 128->9, 192->11, 256->13 521 522aes_loop_initial_enc\num_initial_blocks: 523 MOVADQ (%r10),\TMP1 524.irpc index, \i_seq 525 AESENC \TMP1, %xmm\index 526.endr 527 add $16,%r10 528 sub $1,%eax 529 jnz aes_loop_initial_enc\num_initial_blocks 530 531 MOVADQ (%r10), \TMP1 532.irpc index, \i_seq 533 AESENCLAST \TMP1, %xmm\index # Last Round 534.endr 535.irpc index, \i_seq 536 movdqu (%arg3 , %r11, 1), \TMP1 537 pxor \TMP1, %xmm\index 538 movdqu %xmm\index, (%arg2 , %r11, 1) 539 # write back plaintext/ciphertext for num_initial_blocks 540 add $16, %r11 541 PSHUFB_XMM %xmm14, %xmm\index 542 543 # prepare plaintext/ciphertext for GHASH computation 544.endr 545.endif 546 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 547 # apply GHASH on num_initial_blocks blocks 548 549.if \i == 5 550 pxor %xmm5, %xmm6 551 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 552 pxor %xmm6, %xmm7 553 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 554 pxor %xmm7, %xmm8 555 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 556.elseif \i == 6 557 pxor %xmm6, %xmm7 558 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 559 pxor %xmm7, %xmm8 560 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 561.elseif \i == 7 562 pxor %xmm7, %xmm8 563 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 564.endif 565 cmp $64, %r13 566 jl _initial_blocks_done\num_initial_blocks\operation 567 # no need for precomputed values 568/* 569* 570* Precomputations for HashKey parallel with encryption of first 4 blocks. 571* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 572*/ 573 MOVADQ ONE(%RIP),\TMP1 574 paddd \TMP1, \XMM0 # INCR Y0 575 MOVADQ \XMM0, \XMM1 576 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 577 578 paddd \TMP1, \XMM0 # INCR Y0 579 MOVADQ \XMM0, \XMM2 580 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 581 582 paddd \TMP1, \XMM0 # INCR Y0 583 MOVADQ \XMM0, \XMM3 584 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 585 586 paddd \TMP1, \XMM0 # INCR Y0 587 MOVADQ \XMM0, \XMM4 588 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 589 590 MOVADQ 0(%arg1),\TMP1 591 pxor \TMP1, \XMM1 592 pxor \TMP1, \XMM2 593 pxor \TMP1, \XMM3 594 pxor \TMP1, \XMM4 595 movdqa \TMP3, \TMP5 596 pshufd $78, \TMP3, \TMP1 597 pxor \TMP3, \TMP1 598 movdqa \TMP1, HashKey_k(%rsp) 599 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 600# TMP5 = HashKey^2<<1 (mod poly) 601 movdqa \TMP5, HashKey_2(%rsp) 602# HashKey_2 = HashKey^2<<1 (mod poly) 603 pshufd $78, \TMP5, \TMP1 604 pxor \TMP5, \TMP1 605 movdqa \TMP1, HashKey_2_k(%rsp) 606.irpc index, 1234 # do 4 rounds 607 movaps 0x10*\index(%arg1), \TMP1 608 AESENC \TMP1, \XMM1 609 AESENC \TMP1, \XMM2 610 AESENC \TMP1, \XMM3 611 AESENC \TMP1, \XMM4 612.endr 613 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 614# TMP5 = HashKey^3<<1 (mod poly) 615 movdqa \TMP5, HashKey_3(%rsp) 616 pshufd $78, \TMP5, \TMP1 617 pxor \TMP5, \TMP1 618 movdqa \TMP1, HashKey_3_k(%rsp) 619.irpc index, 56789 # do next 5 rounds 620 movaps 0x10*\index(%arg1), \TMP1 621 AESENC \TMP1, \XMM1 622 AESENC \TMP1, \XMM2 623 AESENC \TMP1, \XMM3 624 AESENC \TMP1, \XMM4 625.endr 626 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 627# TMP5 = HashKey^3<<1 (mod poly) 628 movdqa \TMP5, HashKey_4(%rsp) 629 pshufd $78, \TMP5, \TMP1 630 pxor \TMP5, \TMP1 631 movdqa \TMP1, HashKey_4_k(%rsp) 632 lea 0xa0(%arg1),%r10 633 mov keysize,%eax 634 shr $2,%eax # 128->4, 192->6, 256->8 635 sub $4,%eax # 128->0, 192->2, 256->4 636 jz aes_loop_pre_enc_done\num_initial_blocks 637 638aes_loop_pre_enc\num_initial_blocks: 639 MOVADQ (%r10),\TMP2 640.irpc index, 1234 641 AESENC \TMP2, %xmm\index 642.endr 643 add $16,%r10 644 sub $1,%eax 645 jnz aes_loop_pre_enc\num_initial_blocks 646 647aes_loop_pre_enc_done\num_initial_blocks: 648 MOVADQ (%r10), \TMP2 649 AESENCLAST \TMP2, \XMM1 650 AESENCLAST \TMP2, \XMM2 651 AESENCLAST \TMP2, \XMM3 652 AESENCLAST \TMP2, \XMM4 653 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 654 pxor \TMP1, \XMM1 655 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 656 pxor \TMP1, \XMM2 657 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 658 pxor \TMP1, \XMM3 659 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 660 pxor \TMP1, \XMM4 661 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 662 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 663 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 664 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 665 666 add $64, %r11 667 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 668 pxor \XMMDst, \XMM1 669# combine GHASHed value with the corresponding ciphertext 670 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 671 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 672 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 673 674_initial_blocks_done\num_initial_blocks\operation: 675 676.endm 677 678/* 679* encrypt 4 blocks at a time 680* ghash the 4 previously encrypted ciphertext blocks 681* arg1, %arg2, %arg3 are used as pointers only, not modified 682* %r11 is the data offset value 683*/ 684.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ 685TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 686 687 movdqa \XMM1, \XMM5 688 movdqa \XMM2, \XMM6 689 movdqa \XMM3, \XMM7 690 movdqa \XMM4, \XMM8 691 692 movdqa SHUF_MASK(%rip), %xmm15 693 # multiply TMP5 * HashKey using karatsuba 694 695 movdqa \XMM5, \TMP4 696 pshufd $78, \XMM5, \TMP6 697 pxor \XMM5, \TMP6 698 paddd ONE(%rip), \XMM0 # INCR CNT 699 movdqa HashKey_4(%rsp), \TMP5 700 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 701 movdqa \XMM0, \XMM1 702 paddd ONE(%rip), \XMM0 # INCR CNT 703 movdqa \XMM0, \XMM2 704 paddd ONE(%rip), \XMM0 # INCR CNT 705 movdqa \XMM0, \XMM3 706 paddd ONE(%rip), \XMM0 # INCR CNT 707 movdqa \XMM0, \XMM4 708 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 709 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 710 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 711 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 712 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 713 714 pxor (%arg1), \XMM1 715 pxor (%arg1), \XMM2 716 pxor (%arg1), \XMM3 717 pxor (%arg1), \XMM4 718 movdqa HashKey_4_k(%rsp), \TMP5 719 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 720 movaps 0x10(%arg1), \TMP1 721 AESENC \TMP1, \XMM1 # Round 1 722 AESENC \TMP1, \XMM2 723 AESENC \TMP1, \XMM3 724 AESENC \TMP1, \XMM4 725 movaps 0x20(%arg1), \TMP1 726 AESENC \TMP1, \XMM1 # Round 2 727 AESENC \TMP1, \XMM2 728 AESENC \TMP1, \XMM3 729 AESENC \TMP1, \XMM4 730 movdqa \XMM6, \TMP1 731 pshufd $78, \XMM6, \TMP2 732 pxor \XMM6, \TMP2 733 movdqa HashKey_3(%rsp), \TMP5 734 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 735 movaps 0x30(%arg1), \TMP3 736 AESENC \TMP3, \XMM1 # Round 3 737 AESENC \TMP3, \XMM2 738 AESENC \TMP3, \XMM3 739 AESENC \TMP3, \XMM4 740 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 741 movaps 0x40(%arg1), \TMP3 742 AESENC \TMP3, \XMM1 # Round 4 743 AESENC \TMP3, \XMM2 744 AESENC \TMP3, \XMM3 745 AESENC \TMP3, \XMM4 746 movdqa HashKey_3_k(%rsp), \TMP5 747 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 748 movaps 0x50(%arg1), \TMP3 749 AESENC \TMP3, \XMM1 # Round 5 750 AESENC \TMP3, \XMM2 751 AESENC \TMP3, \XMM3 752 AESENC \TMP3, \XMM4 753 pxor \TMP1, \TMP4 754# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 755 pxor \XMM6, \XMM5 756 pxor \TMP2, \TMP6 757 movdqa \XMM7, \TMP1 758 pshufd $78, \XMM7, \TMP2 759 pxor \XMM7, \TMP2 760 movdqa HashKey_2(%rsp ), \TMP5 761 762 # Multiply TMP5 * HashKey using karatsuba 763 764 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 765 movaps 0x60(%arg1), \TMP3 766 AESENC \TMP3, \XMM1 # Round 6 767 AESENC \TMP3, \XMM2 768 AESENC \TMP3, \XMM3 769 AESENC \TMP3, \XMM4 770 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 771 movaps 0x70(%arg1), \TMP3 772 AESENC \TMP3, \XMM1 # Round 7 773 AESENC \TMP3, \XMM2 774 AESENC \TMP3, \XMM3 775 AESENC \TMP3, \XMM4 776 movdqa HashKey_2_k(%rsp), \TMP5 777 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 778 movaps 0x80(%arg1), \TMP3 779 AESENC \TMP3, \XMM1 # Round 8 780 AESENC \TMP3, \XMM2 781 AESENC \TMP3, \XMM3 782 AESENC \TMP3, \XMM4 783 pxor \TMP1, \TMP4 784# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 785 pxor \XMM7, \XMM5 786 pxor \TMP2, \TMP6 787 788 # Multiply XMM8 * HashKey 789 # XMM8 and TMP5 hold the values for the two operands 790 791 movdqa \XMM8, \TMP1 792 pshufd $78, \XMM8, \TMP2 793 pxor \XMM8, \TMP2 794 movdqa HashKey(%rsp), \TMP5 795 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 796 movaps 0x90(%arg1), \TMP3 797 AESENC \TMP3, \XMM1 # Round 9 798 AESENC \TMP3, \XMM2 799 AESENC \TMP3, \XMM3 800 AESENC \TMP3, \XMM4 801 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 802 lea 0xa0(%arg1),%r10 803 mov keysize,%eax 804 shr $2,%eax # 128->4, 192->6, 256->8 805 sub $4,%eax # 128->0, 192->2, 256->4 806 jz aes_loop_par_enc_done 807 808aes_loop_par_enc: 809 MOVADQ (%r10),\TMP3 810.irpc index, 1234 811 AESENC \TMP3, %xmm\index 812.endr 813 add $16,%r10 814 sub $1,%eax 815 jnz aes_loop_par_enc 816 817aes_loop_par_enc_done: 818 MOVADQ (%r10), \TMP3 819 AESENCLAST \TMP3, \XMM1 # Round 10 820 AESENCLAST \TMP3, \XMM2 821 AESENCLAST \TMP3, \XMM3 822 AESENCLAST \TMP3, \XMM4 823 movdqa HashKey_k(%rsp), \TMP5 824 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 825 movdqu (%arg3,%r11,1), \TMP3 826 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 827 movdqu 16(%arg3,%r11,1), \TMP3 828 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 829 movdqu 32(%arg3,%r11,1), \TMP3 830 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 831 movdqu 48(%arg3,%r11,1), \TMP3 832 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 833 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer 834 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer 835 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer 836 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer 837 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 838 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 839 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 840 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 841 842 pxor \TMP4, \TMP1 843 pxor \XMM8, \XMM5 844 pxor \TMP6, \TMP2 845 pxor \TMP1, \TMP2 846 pxor \XMM5, \TMP2 847 movdqa \TMP2, \TMP3 848 pslldq $8, \TMP3 # left shift TMP3 2 DWs 849 psrldq $8, \TMP2 # right shift TMP2 2 DWs 850 pxor \TMP3, \XMM5 851 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 852 853 # first phase of reduction 854 855 movdqa \XMM5, \TMP2 856 movdqa \XMM5, \TMP3 857 movdqa \XMM5, \TMP4 858# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 859 pslld $31, \TMP2 # packed right shift << 31 860 pslld $30, \TMP3 # packed right shift << 30 861 pslld $25, \TMP4 # packed right shift << 25 862 pxor \TMP3, \TMP2 # xor the shifted versions 863 pxor \TMP4, \TMP2 864 movdqa \TMP2, \TMP5 865 psrldq $4, \TMP5 # right shift T5 1 DW 866 pslldq $12, \TMP2 # left shift T2 3 DWs 867 pxor \TMP2, \XMM5 868 869 # second phase of reduction 870 871 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 872 movdqa \XMM5,\TMP3 873 movdqa \XMM5,\TMP4 874 psrld $1, \TMP2 # packed left shift >>1 875 psrld $2, \TMP3 # packed left shift >>2 876 psrld $7, \TMP4 # packed left shift >>7 877 pxor \TMP3,\TMP2 # xor the shifted versions 878 pxor \TMP4,\TMP2 879 pxor \TMP5, \TMP2 880 pxor \TMP2, \XMM5 881 pxor \TMP1, \XMM5 # result is in TMP1 882 883 pxor \XMM5, \XMM1 884.endm 885 886/* 887* decrypt 4 blocks at a time 888* ghash the 4 previously decrypted ciphertext blocks 889* arg1, %arg2, %arg3 are used as pointers only, not modified 890* %r11 is the data offset value 891*/ 892.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ 893TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 894 895 movdqa \XMM1, \XMM5 896 movdqa \XMM2, \XMM6 897 movdqa \XMM3, \XMM7 898 movdqa \XMM4, \XMM8 899 900 movdqa SHUF_MASK(%rip), %xmm15 901 # multiply TMP5 * HashKey using karatsuba 902 903 movdqa \XMM5, \TMP4 904 pshufd $78, \XMM5, \TMP6 905 pxor \XMM5, \TMP6 906 paddd ONE(%rip), \XMM0 # INCR CNT 907 movdqa HashKey_4(%rsp), \TMP5 908 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 909 movdqa \XMM0, \XMM1 910 paddd ONE(%rip), \XMM0 # INCR CNT 911 movdqa \XMM0, \XMM2 912 paddd ONE(%rip), \XMM0 # INCR CNT 913 movdqa \XMM0, \XMM3 914 paddd ONE(%rip), \XMM0 # INCR CNT 915 movdqa \XMM0, \XMM4 916 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 917 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 918 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 919 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 920 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 921 922 pxor (%arg1), \XMM1 923 pxor (%arg1), \XMM2 924 pxor (%arg1), \XMM3 925 pxor (%arg1), \XMM4 926 movdqa HashKey_4_k(%rsp), \TMP5 927 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 928 movaps 0x10(%arg1), \TMP1 929 AESENC \TMP1, \XMM1 # Round 1 930 AESENC \TMP1, \XMM2 931 AESENC \TMP1, \XMM3 932 AESENC \TMP1, \XMM4 933 movaps 0x20(%arg1), \TMP1 934 AESENC \TMP1, \XMM1 # Round 2 935 AESENC \TMP1, \XMM2 936 AESENC \TMP1, \XMM3 937 AESENC \TMP1, \XMM4 938 movdqa \XMM6, \TMP1 939 pshufd $78, \XMM6, \TMP2 940 pxor \XMM6, \TMP2 941 movdqa HashKey_3(%rsp), \TMP5 942 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 943 movaps 0x30(%arg1), \TMP3 944 AESENC \TMP3, \XMM1 # Round 3 945 AESENC \TMP3, \XMM2 946 AESENC \TMP3, \XMM3 947 AESENC \TMP3, \XMM4 948 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 949 movaps 0x40(%arg1), \TMP3 950 AESENC \TMP3, \XMM1 # Round 4 951 AESENC \TMP3, \XMM2 952 AESENC \TMP3, \XMM3 953 AESENC \TMP3, \XMM4 954 movdqa HashKey_3_k(%rsp), \TMP5 955 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 956 movaps 0x50(%arg1), \TMP3 957 AESENC \TMP3, \XMM1 # Round 5 958 AESENC \TMP3, \XMM2 959 AESENC \TMP3, \XMM3 960 AESENC \TMP3, \XMM4 961 pxor \TMP1, \TMP4 962# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 963 pxor \XMM6, \XMM5 964 pxor \TMP2, \TMP6 965 movdqa \XMM7, \TMP1 966 pshufd $78, \XMM7, \TMP2 967 pxor \XMM7, \TMP2 968 movdqa HashKey_2(%rsp ), \TMP5 969 970 # Multiply TMP5 * HashKey using karatsuba 971 972 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 973 movaps 0x60(%arg1), \TMP3 974 AESENC \TMP3, \XMM1 # Round 6 975 AESENC \TMP3, \XMM2 976 AESENC \TMP3, \XMM3 977 AESENC \TMP3, \XMM4 978 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 979 movaps 0x70(%arg1), \TMP3 980 AESENC \TMP3, \XMM1 # Round 7 981 AESENC \TMP3, \XMM2 982 AESENC \TMP3, \XMM3 983 AESENC \TMP3, \XMM4 984 movdqa HashKey_2_k(%rsp), \TMP5 985 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 986 movaps 0x80(%arg1), \TMP3 987 AESENC \TMP3, \XMM1 # Round 8 988 AESENC \TMP3, \XMM2 989 AESENC \TMP3, \XMM3 990 AESENC \TMP3, \XMM4 991 pxor \TMP1, \TMP4 992# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 993 pxor \XMM7, \XMM5 994 pxor \TMP2, \TMP6 995 996 # Multiply XMM8 * HashKey 997 # XMM8 and TMP5 hold the values for the two operands 998 999 movdqa \XMM8, \TMP1 1000 pshufd $78, \XMM8, \TMP2 1001 pxor \XMM8, \TMP2 1002 movdqa HashKey(%rsp), \TMP5 1003 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1004 movaps 0x90(%arg1), \TMP3 1005 AESENC \TMP3, \XMM1 # Round 9 1006 AESENC \TMP3, \XMM2 1007 AESENC \TMP3, \XMM3 1008 AESENC \TMP3, \XMM4 1009 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 1010 lea 0xa0(%arg1),%r10 1011 mov keysize,%eax 1012 shr $2,%eax # 128->4, 192->6, 256->8 1013 sub $4,%eax # 128->0, 192->2, 256->4 1014 jz aes_loop_par_dec_done 1015 1016aes_loop_par_dec: 1017 MOVADQ (%r10),\TMP3 1018.irpc index, 1234 1019 AESENC \TMP3, %xmm\index 1020.endr 1021 add $16,%r10 1022 sub $1,%eax 1023 jnz aes_loop_par_dec 1024 1025aes_loop_par_dec_done: 1026 MOVADQ (%r10), \TMP3 1027 AESENCLAST \TMP3, \XMM1 # last round 1028 AESENCLAST \TMP3, \XMM2 1029 AESENCLAST \TMP3, \XMM3 1030 AESENCLAST \TMP3, \XMM4 1031 movdqa HashKey_k(%rsp), \TMP5 1032 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1033 movdqu (%arg3,%r11,1), \TMP3 1034 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1035 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer 1036 movdqa \TMP3, \XMM1 1037 movdqu 16(%arg3,%r11,1), \TMP3 1038 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1039 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer 1040 movdqa \TMP3, \XMM2 1041 movdqu 32(%arg3,%r11,1), \TMP3 1042 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1043 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer 1044 movdqa \TMP3, \XMM3 1045 movdqu 48(%arg3,%r11,1), \TMP3 1046 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1047 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer 1048 movdqa \TMP3, \XMM4 1049 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 1050 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 1051 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 1052 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 1053 1054 pxor \TMP4, \TMP1 1055 pxor \XMM8, \XMM5 1056 pxor \TMP6, \TMP2 1057 pxor \TMP1, \TMP2 1058 pxor \XMM5, \TMP2 1059 movdqa \TMP2, \TMP3 1060 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1061 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1062 pxor \TMP3, \XMM5 1063 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1064 1065 # first phase of reduction 1066 1067 movdqa \XMM5, \TMP2 1068 movdqa \XMM5, \TMP3 1069 movdqa \XMM5, \TMP4 1070# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1071 pslld $31, \TMP2 # packed right shift << 31 1072 pslld $30, \TMP3 # packed right shift << 30 1073 pslld $25, \TMP4 # packed right shift << 25 1074 pxor \TMP3, \TMP2 # xor the shifted versions 1075 pxor \TMP4, \TMP2 1076 movdqa \TMP2, \TMP5 1077 psrldq $4, \TMP5 # right shift T5 1 DW 1078 pslldq $12, \TMP2 # left shift T2 3 DWs 1079 pxor \TMP2, \XMM5 1080 1081 # second phase of reduction 1082 1083 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1084 movdqa \XMM5,\TMP3 1085 movdqa \XMM5,\TMP4 1086 psrld $1, \TMP2 # packed left shift >>1 1087 psrld $2, \TMP3 # packed left shift >>2 1088 psrld $7, \TMP4 # packed left shift >>7 1089 pxor \TMP3,\TMP2 # xor the shifted versions 1090 pxor \TMP4,\TMP2 1091 pxor \TMP5, \TMP2 1092 pxor \TMP2, \XMM5 1093 pxor \TMP1, \XMM5 # result is in TMP1 1094 1095 pxor \XMM5, \XMM1 1096.endm 1097 1098/* GHASH the last 4 ciphertext blocks. */ 1099.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ 1100TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst 1101 1102 # Multiply TMP6 * HashKey (using Karatsuba) 1103 1104 movdqa \XMM1, \TMP6 1105 pshufd $78, \XMM1, \TMP2 1106 pxor \XMM1, \TMP2 1107 movdqa HashKey_4(%rsp), \TMP5 1108 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 1109 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 1110 movdqa HashKey_4_k(%rsp), \TMP4 1111 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1112 movdqa \XMM1, \XMMDst 1113 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 1114 1115 # Multiply TMP1 * HashKey (using Karatsuba) 1116 1117 movdqa \XMM2, \TMP1 1118 pshufd $78, \XMM2, \TMP2 1119 pxor \XMM2, \TMP2 1120 movdqa HashKey_3(%rsp), \TMP5 1121 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1122 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 1123 movdqa HashKey_3_k(%rsp), \TMP4 1124 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1125 pxor \TMP1, \TMP6 1126 pxor \XMM2, \XMMDst 1127 pxor \TMP2, \XMM1 1128# results accumulated in TMP6, XMMDst, XMM1 1129 1130 # Multiply TMP1 * HashKey (using Karatsuba) 1131 1132 movdqa \XMM3, \TMP1 1133 pshufd $78, \XMM3, \TMP2 1134 pxor \XMM3, \TMP2 1135 movdqa HashKey_2(%rsp), \TMP5 1136 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1137 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 1138 movdqa HashKey_2_k(%rsp), \TMP4 1139 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1140 pxor \TMP1, \TMP6 1141 pxor \XMM3, \XMMDst 1142 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 1143 1144 # Multiply TMP1 * HashKey (using Karatsuba) 1145 movdqa \XMM4, \TMP1 1146 pshufd $78, \XMM4, \TMP2 1147 pxor \XMM4, \TMP2 1148 movdqa HashKey(%rsp), \TMP5 1149 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1150 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 1151 movdqa HashKey_k(%rsp), \TMP4 1152 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1153 pxor \TMP1, \TMP6 1154 pxor \XMM4, \XMMDst 1155 pxor \XMM1, \TMP2 1156 pxor \TMP6, \TMP2 1157 pxor \XMMDst, \TMP2 1158 # middle section of the temp results combined as in karatsuba algorithm 1159 movdqa \TMP2, \TMP4 1160 pslldq $8, \TMP4 # left shift TMP4 2 DWs 1161 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1162 pxor \TMP4, \XMMDst 1163 pxor \TMP2, \TMP6 1164# TMP6:XMMDst holds the result of the accumulated carry-less multiplications 1165 # first phase of the reduction 1166 movdqa \XMMDst, \TMP2 1167 movdqa \XMMDst, \TMP3 1168 movdqa \XMMDst, \TMP4 1169# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently 1170 pslld $31, \TMP2 # packed right shifting << 31 1171 pslld $30, \TMP3 # packed right shifting << 30 1172 pslld $25, \TMP4 # packed right shifting << 25 1173 pxor \TMP3, \TMP2 # xor the shifted versions 1174 pxor \TMP4, \TMP2 1175 movdqa \TMP2, \TMP7 1176 psrldq $4, \TMP7 # right shift TMP7 1 DW 1177 pslldq $12, \TMP2 # left shift TMP2 3 DWs 1178 pxor \TMP2, \XMMDst 1179 1180 # second phase of the reduction 1181 movdqa \XMMDst, \TMP2 1182 # make 3 copies of XMMDst for doing 3 shift operations 1183 movdqa \XMMDst, \TMP3 1184 movdqa \XMMDst, \TMP4 1185 psrld $1, \TMP2 # packed left shift >> 1 1186 psrld $2, \TMP3 # packed left shift >> 2 1187 psrld $7, \TMP4 # packed left shift >> 7 1188 pxor \TMP3, \TMP2 # xor the shifted versions 1189 pxor \TMP4, \TMP2 1190 pxor \TMP7, \TMP2 1191 pxor \TMP2, \XMMDst 1192 pxor \TMP6, \XMMDst # reduced result is in XMMDst 1193.endm 1194 1195 1196/* Encryption of a single block 1197* uses eax & r10 1198*/ 1199 1200.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 1201 1202 pxor (%arg1), \XMM0 1203 mov keysize,%eax 1204 shr $2,%eax # 128->4, 192->6, 256->8 1205 add $5,%eax # 128->9, 192->11, 256->13 1206 lea 16(%arg1), %r10 # get first expanded key address 1207 1208_esb_loop_\@: 1209 MOVADQ (%r10),\TMP1 1210 AESENC \TMP1,\XMM0 1211 add $16,%r10 1212 sub $1,%eax 1213 jnz _esb_loop_\@ 1214 1215 MOVADQ (%r10),\TMP1 1216 AESENCLAST \TMP1,\XMM0 1217.endm 1218/***************************************************************************** 1219* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1220* u8 *out, // Plaintext output. Encrypt in-place is allowed. 1221* const u8 *in, // Ciphertext input 1222* u64 plaintext_len, // Length of data in bytes for decryption. 1223* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1224* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1225* // concatenated with 0x00000001. 16-byte aligned pointer. 1226* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1227* const u8 *aad, // Additional Authentication Data (AAD) 1228* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1229* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the 1230* // given authentication tag and only return the plaintext if they match. 1231* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 1232* // (most likely), 12 or 8. 1233* 1234* Assumptions: 1235* 1236* keys: 1237* keys are pre-expanded and aligned to 16 bytes. we are using the first 1238* set of 11 keys in the data structure void *aes_ctx 1239* 1240* iv: 1241* 0 1 2 3 1242* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1243* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1244* | Salt (From the SA) | 1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1246* | Initialization Vector | 1247* | (This is the sequence number from IPSec header) | 1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1249* | 0x1 | 1250* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1251* 1252* 1253* 1254* AAD: 1255* AAD padded to 128 bits with 0 1256* for example, assume AAD is a u32 vector 1257* 1258* if AAD is 8 bytes: 1259* AAD[3] = {A0, A1}; 1260* padded AAD in xmm register = {A1 A0 0 0} 1261* 1262* 0 1 2 3 1263* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1264* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1265* | SPI (A1) | 1266* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1267* | 32-bit Sequence Number (A0) | 1268* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1269* | 0x0 | 1270* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1271* 1272* AAD Format with 32-bit Sequence Number 1273* 1274* if AAD is 12 bytes: 1275* AAD[3] = {A0, A1, A2}; 1276* padded AAD in xmm register = {A2 A1 A0 0} 1277* 1278* 0 1 2 3 1279* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1280* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1281* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1282* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1283* | SPI (A2) | 1284* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1285* | 64-bit Extended Sequence Number {A1,A0} | 1286* | | 1287* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1288* | 0x0 | 1289* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1290* 1291* AAD Format with 64-bit Extended Sequence Number 1292* 1293* aadLen: 1294* from the definition of the spec, aadLen can only be 8 or 12 bytes. 1295* The code supports 16 too but for other sizes, the code will fail. 1296* 1297* TLen: 1298* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 1299* For other sizes, the code will fail. 1300* 1301* poly = x^128 + x^127 + x^126 + x^121 + 1 1302* 1303*****************************************************************************/ 1304ENTRY(aesni_gcm_dec) 1305 push %r12 1306 push %r13 1307 push %r14 1308 mov %rsp, %r14 1309/* 1310* states of %xmm registers %xmm6:%xmm15 not saved 1311* all %xmm registers are clobbered 1312*/ 1313 sub $VARIABLE_OFFSET, %rsp 1314 and $~63, %rsp # align rsp to 64 bytes 1315 mov %arg6, %r12 1316 movdqu (%r12), %xmm13 # %xmm13 = HashKey 1317 movdqa SHUF_MASK(%rip), %xmm2 1318 PSHUFB_XMM %xmm2, %xmm13 1319 1320 1321# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) 1322 1323 movdqa %xmm13, %xmm2 1324 psllq $1, %xmm13 1325 psrlq $63, %xmm2 1326 movdqa %xmm2, %xmm1 1327 pslldq $8, %xmm2 1328 psrldq $8, %xmm1 1329 por %xmm2, %xmm13 1330 1331 # Reduction 1332 1333 pshufd $0x24, %xmm1, %xmm2 1334 pcmpeqd TWOONE(%rip), %xmm2 1335 pand POLY(%rip), %xmm2 1336 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly) 1337 1338 1339 # Decrypt first few blocks 1340 1341 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) 1342 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext 1343 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) 1344 mov %r13, %r12 1345 and $(3<<4), %r12 1346 jz _initial_num_blocks_is_0_decrypt 1347 cmp $(2<<4), %r12 1348 jb _initial_num_blocks_is_1_decrypt 1349 je _initial_num_blocks_is_2_decrypt 1350_initial_num_blocks_is_3_decrypt: 1351 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1352%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec 1353 sub $48, %r13 1354 jmp _initial_blocks_decrypted 1355_initial_num_blocks_is_2_decrypt: 1356 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1357%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec 1358 sub $32, %r13 1359 jmp _initial_blocks_decrypted 1360_initial_num_blocks_is_1_decrypt: 1361 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1362%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec 1363 sub $16, %r13 1364 jmp _initial_blocks_decrypted 1365_initial_num_blocks_is_0_decrypt: 1366 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1367%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec 1368_initial_blocks_decrypted: 1369 cmp $0, %r13 1370 je _zero_cipher_left_decrypt 1371 sub $64, %r13 1372 je _four_cipher_left_decrypt 1373_decrypt_by_4: 1374 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1375%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec 1376 add $64, %r11 1377 sub $64, %r13 1378 jne _decrypt_by_4 1379_four_cipher_left_decrypt: 1380 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 1381%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 1382_zero_cipher_left_decrypt: 1383 mov %arg4, %r13 1384 and $15, %r13 # %r13 = arg4 (mod 16) 1385 je _multiple_of_16_bytes_decrypt 1386 1387 # Handle the last <16 byte block separately 1388 1389 paddd ONE(%rip), %xmm0 # increment CNT to get Yn 1390 movdqa SHUF_MASK(%rip), %xmm10 1391 PSHUFB_XMM %xmm10, %xmm0 1392 1393 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) 1394 sub $16, %r11 1395 add %r13, %r11 1396 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block 1397 lea SHIFT_MASK+16(%rip), %r12 1398 sub %r13, %r12 1399# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes 1400# (%r13 is the number of bytes in plaintext mod 16) 1401 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 1402 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 butes 1403 1404 movdqa %xmm1, %xmm2 1405 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) 1406 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 1407 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 1408 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 1409 pand %xmm1, %xmm2 1410 movdqa SHUF_MASK(%rip), %xmm10 1411 PSHUFB_XMM %xmm10 ,%xmm2 1412 1413 pxor %xmm2, %xmm8 1414 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1415 # GHASH computation for the last <16 byte block 1416 sub %r13, %r11 1417 add $16, %r11 1418 1419 # output %r13 bytes 1420 MOVQ_R64_XMM %xmm0, %rax 1421 cmp $8, %r13 1422 jle _less_than_8_bytes_left_decrypt 1423 mov %rax, (%arg2 , %r11, 1) 1424 add $8, %r11 1425 psrldq $8, %xmm0 1426 MOVQ_R64_XMM %xmm0, %rax 1427 sub $8, %r13 1428_less_than_8_bytes_left_decrypt: 1429 mov %al, (%arg2, %r11, 1) 1430 add $1, %r11 1431 shr $8, %rax 1432 sub $1, %r13 1433 jne _less_than_8_bytes_left_decrypt 1434_multiple_of_16_bytes_decrypt: 1435 mov arg8, %r12 # %r13 = aadLen (number of bytes) 1436 shl $3, %r12 # convert into number of bits 1437 movd %r12d, %xmm15 # len(A) in %xmm15 1438 shl $3, %arg4 # len(C) in bits (*128) 1439 MOVQ_R64_XMM %arg4, %xmm1 1440 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1441 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1442 pxor %xmm15, %xmm8 1443 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1444 # final GHASH computation 1445 movdqa SHUF_MASK(%rip), %xmm10 1446 PSHUFB_XMM %xmm10, %xmm8 1447 1448 mov %arg5, %rax # %rax = *Y0 1449 movdqu (%rax), %xmm0 # %xmm0 = Y0 1450 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) 1451 pxor %xmm8, %xmm0 1452_return_T_decrypt: 1453 mov arg9, %r10 # %r10 = authTag 1454 mov arg10, %r11 # %r11 = auth_tag_len 1455 cmp $16, %r11 1456 je _T_16_decrypt 1457 cmp $12, %r11 1458 je _T_12_decrypt 1459_T_8_decrypt: 1460 MOVQ_R64_XMM %xmm0, %rax 1461 mov %rax, (%r10) 1462 jmp _return_T_done_decrypt 1463_T_12_decrypt: 1464 MOVQ_R64_XMM %xmm0, %rax 1465 mov %rax, (%r10) 1466 psrldq $8, %xmm0 1467 movd %xmm0, %eax 1468 mov %eax, 8(%r10) 1469 jmp _return_T_done_decrypt 1470_T_16_decrypt: 1471 movdqu %xmm0, (%r10) 1472_return_T_done_decrypt: 1473 mov %r14, %rsp 1474 pop %r14 1475 pop %r13 1476 pop %r12 1477 ret 1478ENDPROC(aesni_gcm_dec) 1479 1480 1481/***************************************************************************** 1482* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1483* u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1484* const u8 *in, // Plaintext input 1485* u64 plaintext_len, // Length of data in bytes for encryption. 1486* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1487* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1488* // concatenated with 0x00000001. 16-byte aligned pointer. 1489* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1490* const u8 *aad, // Additional Authentication Data (AAD) 1491* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1492* u8 *auth_tag, // Authenticated Tag output. 1493* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 1494* // 12 or 8. 1495* 1496* Assumptions: 1497* 1498* keys: 1499* keys are pre-expanded and aligned to 16 bytes. we are using the 1500* first set of 11 keys in the data structure void *aes_ctx 1501* 1502* 1503* iv: 1504* 0 1 2 3 1505* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1507* | Salt (From the SA) | 1508* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1509* | Initialization Vector | 1510* | (This is the sequence number from IPSec header) | 1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1512* | 0x1 | 1513* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1514* 1515* 1516* 1517* AAD: 1518* AAD padded to 128 bits with 0 1519* for example, assume AAD is a u32 vector 1520* 1521* if AAD is 8 bytes: 1522* AAD[3] = {A0, A1}; 1523* padded AAD in xmm register = {A1 A0 0 0} 1524* 1525* 0 1 2 3 1526* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1527* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1528* | SPI (A1) | 1529* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1530* | 32-bit Sequence Number (A0) | 1531* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1532* | 0x0 | 1533* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1534* 1535* AAD Format with 32-bit Sequence Number 1536* 1537* if AAD is 12 bytes: 1538* AAD[3] = {A0, A1, A2}; 1539* padded AAD in xmm register = {A2 A1 A0 0} 1540* 1541* 0 1 2 3 1542* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1543* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1544* | SPI (A2) | 1545* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1546* | 64-bit Extended Sequence Number {A1,A0} | 1547* | | 1548* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1549* | 0x0 | 1550* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1551* 1552* AAD Format with 64-bit Extended Sequence Number 1553* 1554* aadLen: 1555* from the definition of the spec, aadLen can only be 8 or 12 bytes. 1556* The code supports 16 too but for other sizes, the code will fail. 1557* 1558* TLen: 1559* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 1560* For other sizes, the code will fail. 1561* 1562* poly = x^128 + x^127 + x^126 + x^121 + 1 1563***************************************************************************/ 1564ENTRY(aesni_gcm_enc) 1565 push %r12 1566 push %r13 1567 push %r14 1568 mov %rsp, %r14 1569# 1570# states of %xmm registers %xmm6:%xmm15 not saved 1571# all %xmm registers are clobbered 1572# 1573 sub $VARIABLE_OFFSET, %rsp 1574 and $~63, %rsp 1575 mov %arg6, %r12 1576 movdqu (%r12), %xmm13 1577 movdqa SHUF_MASK(%rip), %xmm2 1578 PSHUFB_XMM %xmm2, %xmm13 1579 1580 1581# precompute HashKey<<1 mod poly from the HashKey (required for GHASH) 1582 1583 movdqa %xmm13, %xmm2 1584 psllq $1, %xmm13 1585 psrlq $63, %xmm2 1586 movdqa %xmm2, %xmm1 1587 pslldq $8, %xmm2 1588 psrldq $8, %xmm1 1589 por %xmm2, %xmm13 1590 1591 # reduce HashKey<<1 1592 1593 pshufd $0x24, %xmm1, %xmm2 1594 pcmpeqd TWOONE(%rip), %xmm2 1595 pand POLY(%rip), %xmm2 1596 pxor %xmm2, %xmm13 1597 movdqa %xmm13, HashKey(%rsp) 1598 mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly) 1599 and $-16, %r13 1600 mov %r13, %r12 1601 1602 # Encrypt first few blocks 1603 1604 and $(3<<4), %r12 1605 jz _initial_num_blocks_is_0_encrypt 1606 cmp $(2<<4), %r12 1607 jb _initial_num_blocks_is_1_encrypt 1608 je _initial_num_blocks_is_2_encrypt 1609_initial_num_blocks_is_3_encrypt: 1610 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1611%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc 1612 sub $48, %r13 1613 jmp _initial_blocks_encrypted 1614_initial_num_blocks_is_2_encrypt: 1615 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1616%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc 1617 sub $32, %r13 1618 jmp _initial_blocks_encrypted 1619_initial_num_blocks_is_1_encrypt: 1620 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1621%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc 1622 sub $16, %r13 1623 jmp _initial_blocks_encrypted 1624_initial_num_blocks_is_0_encrypt: 1625 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1626%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc 1627_initial_blocks_encrypted: 1628 1629 # Main loop - Encrypt remaining blocks 1630 1631 cmp $0, %r13 1632 je _zero_cipher_left_encrypt 1633 sub $64, %r13 1634 je _four_cipher_left_encrypt 1635_encrypt_by_4_encrypt: 1636 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1637%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc 1638 add $64, %r11 1639 sub $64, %r13 1640 jne _encrypt_by_4_encrypt 1641_four_cipher_left_encrypt: 1642 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 1643%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 1644_zero_cipher_left_encrypt: 1645 mov %arg4, %r13 1646 and $15, %r13 # %r13 = arg4 (mod 16) 1647 je _multiple_of_16_bytes_encrypt 1648 1649 # Handle the last <16 Byte block separately 1650 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 1651 movdqa SHUF_MASK(%rip), %xmm10 1652 PSHUFB_XMM %xmm10, %xmm0 1653 1654 1655 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) 1656 sub $16, %r11 1657 add %r13, %r11 1658 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks 1659 lea SHIFT_MASK+16(%rip), %r12 1660 sub %r13, %r12 1661 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 1662 # (%r13 is the number of bytes in plaintext mod 16) 1663 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 1664 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte 1665 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) 1666 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 1667 # get the appropriate mask to mask out top 16-r13 bytes of xmm0 1668 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 1669 movdqa SHUF_MASK(%rip), %xmm10 1670 PSHUFB_XMM %xmm10,%xmm0 1671 1672 pxor %xmm0, %xmm8 1673 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1674 # GHASH computation for the last <16 byte block 1675 sub %r13, %r11 1676 add $16, %r11 1677 1678 movdqa SHUF_MASK(%rip), %xmm10 1679 PSHUFB_XMM %xmm10, %xmm0 1680 1681 # shuffle xmm0 back to output as ciphertext 1682 1683 # Output %r13 bytes 1684 MOVQ_R64_XMM %xmm0, %rax 1685 cmp $8, %r13 1686 jle _less_than_8_bytes_left_encrypt 1687 mov %rax, (%arg2 , %r11, 1) 1688 add $8, %r11 1689 psrldq $8, %xmm0 1690 MOVQ_R64_XMM %xmm0, %rax 1691 sub $8, %r13 1692_less_than_8_bytes_left_encrypt: 1693 mov %al, (%arg2, %r11, 1) 1694 add $1, %r11 1695 shr $8, %rax 1696 sub $1, %r13 1697 jne _less_than_8_bytes_left_encrypt 1698_multiple_of_16_bytes_encrypt: 1699 mov arg8, %r12 # %r12 = addLen (number of bytes) 1700 shl $3, %r12 1701 movd %r12d, %xmm15 # len(A) in %xmm15 1702 shl $3, %arg4 # len(C) in bits (*128) 1703 MOVQ_R64_XMM %arg4, %xmm1 1704 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1705 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1706 pxor %xmm15, %xmm8 1707 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1708 # final GHASH computation 1709 movdqa SHUF_MASK(%rip), %xmm10 1710 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap 1711 1712 mov %arg5, %rax # %rax = *Y0 1713 movdqu (%rax), %xmm0 # %xmm0 = Y0 1714 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) 1715 pxor %xmm8, %xmm0 1716_return_T_encrypt: 1717 mov arg9, %r10 # %r10 = authTag 1718 mov arg10, %r11 # %r11 = auth_tag_len 1719 cmp $16, %r11 1720 je _T_16_encrypt 1721 cmp $12, %r11 1722 je _T_12_encrypt 1723_T_8_encrypt: 1724 MOVQ_R64_XMM %xmm0, %rax 1725 mov %rax, (%r10) 1726 jmp _return_T_done_encrypt 1727_T_12_encrypt: 1728 MOVQ_R64_XMM %xmm0, %rax 1729 mov %rax, (%r10) 1730 psrldq $8, %xmm0 1731 movd %xmm0, %eax 1732 mov %eax, 8(%r10) 1733 jmp _return_T_done_encrypt 1734_T_16_encrypt: 1735 movdqu %xmm0, (%r10) 1736_return_T_done_encrypt: 1737 mov %r14, %rsp 1738 pop %r14 1739 pop %r13 1740 pop %r12 1741 ret 1742ENDPROC(aesni_gcm_enc) 1743 1744#endif 1745 1746 1747.align 4 1748_key_expansion_128: 1749_key_expansion_256a: 1750 pshufd $0b11111111, %xmm1, %xmm1 1751 shufps $0b00010000, %xmm0, %xmm4 1752 pxor %xmm4, %xmm0 1753 shufps $0b10001100, %xmm0, %xmm4 1754 pxor %xmm4, %xmm0 1755 pxor %xmm1, %xmm0 1756 movaps %xmm0, (TKEYP) 1757 add $0x10, TKEYP 1758 ret 1759ENDPROC(_key_expansion_128) 1760ENDPROC(_key_expansion_256a) 1761 1762.align 4 1763_key_expansion_192a: 1764 pshufd $0b01010101, %xmm1, %xmm1 1765 shufps $0b00010000, %xmm0, %xmm4 1766 pxor %xmm4, %xmm0 1767 shufps $0b10001100, %xmm0, %xmm4 1768 pxor %xmm4, %xmm0 1769 pxor %xmm1, %xmm0 1770 1771 movaps %xmm2, %xmm5 1772 movaps %xmm2, %xmm6 1773 pslldq $4, %xmm5 1774 pshufd $0b11111111, %xmm0, %xmm3 1775 pxor %xmm3, %xmm2 1776 pxor %xmm5, %xmm2 1777 1778 movaps %xmm0, %xmm1 1779 shufps $0b01000100, %xmm0, %xmm6 1780 movaps %xmm6, (TKEYP) 1781 shufps $0b01001110, %xmm2, %xmm1 1782 movaps %xmm1, 0x10(TKEYP) 1783 add $0x20, TKEYP 1784 ret 1785ENDPROC(_key_expansion_192a) 1786 1787.align 4 1788_key_expansion_192b: 1789 pshufd $0b01010101, %xmm1, %xmm1 1790 shufps $0b00010000, %xmm0, %xmm4 1791 pxor %xmm4, %xmm0 1792 shufps $0b10001100, %xmm0, %xmm4 1793 pxor %xmm4, %xmm0 1794 pxor %xmm1, %xmm0 1795 1796 movaps %xmm2, %xmm5 1797 pslldq $4, %xmm5 1798 pshufd $0b11111111, %xmm0, %xmm3 1799 pxor %xmm3, %xmm2 1800 pxor %xmm5, %xmm2 1801 1802 movaps %xmm0, (TKEYP) 1803 add $0x10, TKEYP 1804 ret 1805ENDPROC(_key_expansion_192b) 1806 1807.align 4 1808_key_expansion_256b: 1809 pshufd $0b10101010, %xmm1, %xmm1 1810 shufps $0b00010000, %xmm2, %xmm4 1811 pxor %xmm4, %xmm2 1812 shufps $0b10001100, %xmm2, %xmm4 1813 pxor %xmm4, %xmm2 1814 pxor %xmm1, %xmm2 1815 movaps %xmm2, (TKEYP) 1816 add $0x10, TKEYP 1817 ret 1818ENDPROC(_key_expansion_256b) 1819 1820/* 1821 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 1822 * unsigned int key_len) 1823 */ 1824ENTRY(aesni_set_key) 1825 FRAME_BEGIN 1826#ifndef __x86_64__ 1827 pushl KEYP 1828 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx 1829 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key 1830 movl (FRAME_OFFSET+16)(%esp), %edx # key_len 1831#endif 1832 movups (UKEYP), %xmm0 # user key (first 16 bytes) 1833 movaps %xmm0, (KEYP) 1834 lea 0x10(KEYP), TKEYP # key addr 1835 movl %edx, 480(KEYP) 1836 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1837 cmp $24, %dl 1838 jb .Lenc_key128 1839 je .Lenc_key192 1840 movups 0x10(UKEYP), %xmm2 # other user key 1841 movaps %xmm2, (TKEYP) 1842 add $0x10, TKEYP 1843 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1844 call _key_expansion_256a 1845 AESKEYGENASSIST 0x1 %xmm0 %xmm1 1846 call _key_expansion_256b 1847 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1848 call _key_expansion_256a 1849 AESKEYGENASSIST 0x2 %xmm0 %xmm1 1850 call _key_expansion_256b 1851 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 1852 call _key_expansion_256a 1853 AESKEYGENASSIST 0x4 %xmm0 %xmm1 1854 call _key_expansion_256b 1855 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 1856 call _key_expansion_256a 1857 AESKEYGENASSIST 0x8 %xmm0 %xmm1 1858 call _key_expansion_256b 1859 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 1860 call _key_expansion_256a 1861 AESKEYGENASSIST 0x10 %xmm0 %xmm1 1862 call _key_expansion_256b 1863 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 1864 call _key_expansion_256a 1865 AESKEYGENASSIST 0x20 %xmm0 %xmm1 1866 call _key_expansion_256b 1867 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 1868 call _key_expansion_256a 1869 jmp .Ldec_key 1870.Lenc_key192: 1871 movq 0x10(UKEYP), %xmm2 # other user key 1872 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1873 call _key_expansion_192a 1874 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1875 call _key_expansion_192b 1876 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 1877 call _key_expansion_192a 1878 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 1879 call _key_expansion_192b 1880 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 1881 call _key_expansion_192a 1882 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 1883 call _key_expansion_192b 1884 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 1885 call _key_expansion_192a 1886 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8 1887 call _key_expansion_192b 1888 jmp .Ldec_key 1889.Lenc_key128: 1890 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1 1891 call _key_expansion_128 1892 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2 1893 call _key_expansion_128 1894 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3 1895 call _key_expansion_128 1896 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4 1897 call _key_expansion_128 1898 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5 1899 call _key_expansion_128 1900 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6 1901 call _key_expansion_128 1902 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7 1903 call _key_expansion_128 1904 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8 1905 call _key_expansion_128 1906 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9 1907 call _key_expansion_128 1908 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 1909 call _key_expansion_128 1910.Ldec_key: 1911 sub $0x10, TKEYP 1912 movaps (KEYP), %xmm0 1913 movaps (TKEYP), %xmm1 1914 movaps %xmm0, 240(TKEYP) 1915 movaps %xmm1, 240(KEYP) 1916 add $0x10, KEYP 1917 lea 240-16(TKEYP), UKEYP 1918.align 4 1919.Ldec_key_loop: 1920 movaps (KEYP), %xmm0 1921 AESIMC %xmm0 %xmm1 1922 movaps %xmm1, (UKEYP) 1923 add $0x10, KEYP 1924 sub $0x10, UKEYP 1925 cmp TKEYP, KEYP 1926 jb .Ldec_key_loop 1927 xor AREG, AREG 1928#ifndef __x86_64__ 1929 popl KEYP 1930#endif 1931 FRAME_END 1932 ret 1933ENDPROC(aesni_set_key) 1934 1935/* 1936 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 1937 */ 1938ENTRY(aesni_enc) 1939 FRAME_BEGIN 1940#ifndef __x86_64__ 1941 pushl KEYP 1942 pushl KLEN 1943 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx 1944 movl (FRAME_OFFSET+16)(%esp), OUTP # dst 1945 movl (FRAME_OFFSET+20)(%esp), INP # src 1946#endif 1947 movl 480(KEYP), KLEN # key length 1948 movups (INP), STATE # input 1949 call _aesni_enc1 1950 movups STATE, (OUTP) # output 1951#ifndef __x86_64__ 1952 popl KLEN 1953 popl KEYP 1954#endif 1955 FRAME_END 1956 ret 1957ENDPROC(aesni_enc) 1958 1959/* 1960 * _aesni_enc1: internal ABI 1961 * input: 1962 * KEYP: key struct pointer 1963 * KLEN: round count 1964 * STATE: initial state (input) 1965 * output: 1966 * STATE: finial state (output) 1967 * changed: 1968 * KEY 1969 * TKEYP (T1) 1970 */ 1971.align 4 1972_aesni_enc1: 1973 movaps (KEYP), KEY # key 1974 mov KEYP, TKEYP 1975 pxor KEY, STATE # round 0 1976 add $0x30, TKEYP 1977 cmp $24, KLEN 1978 jb .Lenc128 1979 lea 0x20(TKEYP), TKEYP 1980 je .Lenc192 1981 add $0x20, TKEYP 1982 movaps -0x60(TKEYP), KEY 1983 AESENC KEY STATE 1984 movaps -0x50(TKEYP), KEY 1985 AESENC KEY STATE 1986.align 4 1987.Lenc192: 1988 movaps -0x40(TKEYP), KEY 1989 AESENC KEY STATE 1990 movaps -0x30(TKEYP), KEY 1991 AESENC KEY STATE 1992.align 4 1993.Lenc128: 1994 movaps -0x20(TKEYP), KEY 1995 AESENC KEY STATE 1996 movaps -0x10(TKEYP), KEY 1997 AESENC KEY STATE 1998 movaps (TKEYP), KEY 1999 AESENC KEY STATE 2000 movaps 0x10(TKEYP), KEY 2001 AESENC KEY STATE 2002 movaps 0x20(TKEYP), KEY 2003 AESENC KEY STATE 2004 movaps 0x30(TKEYP), KEY 2005 AESENC KEY STATE 2006 movaps 0x40(TKEYP), KEY 2007 AESENC KEY STATE 2008 movaps 0x50(TKEYP), KEY 2009 AESENC KEY STATE 2010 movaps 0x60(TKEYP), KEY 2011 AESENC KEY STATE 2012 movaps 0x70(TKEYP), KEY 2013 AESENCLAST KEY STATE 2014 ret 2015ENDPROC(_aesni_enc1) 2016 2017/* 2018 * _aesni_enc4: internal ABI 2019 * input: 2020 * KEYP: key struct pointer 2021 * KLEN: round count 2022 * STATE1: initial state (input) 2023 * STATE2 2024 * STATE3 2025 * STATE4 2026 * output: 2027 * STATE1: finial state (output) 2028 * STATE2 2029 * STATE3 2030 * STATE4 2031 * changed: 2032 * KEY 2033 * TKEYP (T1) 2034 */ 2035.align 4 2036_aesni_enc4: 2037 movaps (KEYP), KEY # key 2038 mov KEYP, TKEYP 2039 pxor KEY, STATE1 # round 0 2040 pxor KEY, STATE2 2041 pxor KEY, STATE3 2042 pxor KEY, STATE4 2043 add $0x30, TKEYP 2044 cmp $24, KLEN 2045 jb .L4enc128 2046 lea 0x20(TKEYP), TKEYP 2047 je .L4enc192 2048 add $0x20, TKEYP 2049 movaps -0x60(TKEYP), KEY 2050 AESENC KEY STATE1 2051 AESENC KEY STATE2 2052 AESENC KEY STATE3 2053 AESENC KEY STATE4 2054 movaps -0x50(TKEYP), KEY 2055 AESENC KEY STATE1 2056 AESENC KEY STATE2 2057 AESENC KEY STATE3 2058 AESENC KEY STATE4 2059#.align 4 2060.L4enc192: 2061 movaps -0x40(TKEYP), KEY 2062 AESENC KEY STATE1 2063 AESENC KEY STATE2 2064 AESENC KEY STATE3 2065 AESENC KEY STATE4 2066 movaps -0x30(TKEYP), KEY 2067 AESENC KEY STATE1 2068 AESENC KEY STATE2 2069 AESENC KEY STATE3 2070 AESENC KEY STATE4 2071#.align 4 2072.L4enc128: 2073 movaps -0x20(TKEYP), KEY 2074 AESENC KEY STATE1 2075 AESENC KEY STATE2 2076 AESENC KEY STATE3 2077 AESENC KEY STATE4 2078 movaps -0x10(TKEYP), KEY 2079 AESENC KEY STATE1 2080 AESENC KEY STATE2 2081 AESENC KEY STATE3 2082 AESENC KEY STATE4 2083 movaps (TKEYP), KEY 2084 AESENC KEY STATE1 2085 AESENC KEY STATE2 2086 AESENC KEY STATE3 2087 AESENC KEY STATE4 2088 movaps 0x10(TKEYP), KEY 2089 AESENC KEY STATE1 2090 AESENC KEY STATE2 2091 AESENC KEY STATE3 2092 AESENC KEY STATE4 2093 movaps 0x20(TKEYP), KEY 2094 AESENC KEY STATE1 2095 AESENC KEY STATE2 2096 AESENC KEY STATE3 2097 AESENC KEY STATE4 2098 movaps 0x30(TKEYP), KEY 2099 AESENC KEY STATE1 2100 AESENC KEY STATE2 2101 AESENC KEY STATE3 2102 AESENC KEY STATE4 2103 movaps 0x40(TKEYP), KEY 2104 AESENC KEY STATE1 2105 AESENC KEY STATE2 2106 AESENC KEY STATE3 2107 AESENC KEY STATE4 2108 movaps 0x50(TKEYP), KEY 2109 AESENC KEY STATE1 2110 AESENC KEY STATE2 2111 AESENC KEY STATE3 2112 AESENC KEY STATE4 2113 movaps 0x60(TKEYP), KEY 2114 AESENC KEY STATE1 2115 AESENC KEY STATE2 2116 AESENC KEY STATE3 2117 AESENC KEY STATE4 2118 movaps 0x70(TKEYP), KEY 2119 AESENCLAST KEY STATE1 # last round 2120 AESENCLAST KEY STATE2 2121 AESENCLAST KEY STATE3 2122 AESENCLAST KEY STATE4 2123 ret 2124ENDPROC(_aesni_enc4) 2125 2126/* 2127 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 2128 */ 2129ENTRY(aesni_dec) 2130 FRAME_BEGIN 2131#ifndef __x86_64__ 2132 pushl KEYP 2133 pushl KLEN 2134 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx 2135 movl (FRAME_OFFSET+16)(%esp), OUTP # dst 2136 movl (FRAME_OFFSET+20)(%esp), INP # src 2137#endif 2138 mov 480(KEYP), KLEN # key length 2139 add $240, KEYP 2140 movups (INP), STATE # input 2141 call _aesni_dec1 2142 movups STATE, (OUTP) #output 2143#ifndef __x86_64__ 2144 popl KLEN 2145 popl KEYP 2146#endif 2147 FRAME_END 2148 ret 2149ENDPROC(aesni_dec) 2150 2151/* 2152 * _aesni_dec1: internal ABI 2153 * input: 2154 * KEYP: key struct pointer 2155 * KLEN: key length 2156 * STATE: initial state (input) 2157 * output: 2158 * STATE: finial state (output) 2159 * changed: 2160 * KEY 2161 * TKEYP (T1) 2162 */ 2163.align 4 2164_aesni_dec1: 2165 movaps (KEYP), KEY # key 2166 mov KEYP, TKEYP 2167 pxor KEY, STATE # round 0 2168 add $0x30, TKEYP 2169 cmp $24, KLEN 2170 jb .Ldec128 2171 lea 0x20(TKEYP), TKEYP 2172 je .Ldec192 2173 add $0x20, TKEYP 2174 movaps -0x60(TKEYP), KEY 2175 AESDEC KEY STATE 2176 movaps -0x50(TKEYP), KEY 2177 AESDEC KEY STATE 2178.align 4 2179.Ldec192: 2180 movaps -0x40(TKEYP), KEY 2181 AESDEC KEY STATE 2182 movaps -0x30(TKEYP), KEY 2183 AESDEC KEY STATE 2184.align 4 2185.Ldec128: 2186 movaps -0x20(TKEYP), KEY 2187 AESDEC KEY STATE 2188 movaps -0x10(TKEYP), KEY 2189 AESDEC KEY STATE 2190 movaps (TKEYP), KEY 2191 AESDEC KEY STATE 2192 movaps 0x10(TKEYP), KEY 2193 AESDEC KEY STATE 2194 movaps 0x20(TKEYP), KEY 2195 AESDEC KEY STATE 2196 movaps 0x30(TKEYP), KEY 2197 AESDEC KEY STATE 2198 movaps 0x40(TKEYP), KEY 2199 AESDEC KEY STATE 2200 movaps 0x50(TKEYP), KEY 2201 AESDEC KEY STATE 2202 movaps 0x60(TKEYP), KEY 2203 AESDEC KEY STATE 2204 movaps 0x70(TKEYP), KEY 2205 AESDECLAST KEY STATE 2206 ret 2207ENDPROC(_aesni_dec1) 2208 2209/* 2210 * _aesni_dec4: internal ABI 2211 * input: 2212 * KEYP: key struct pointer 2213 * KLEN: key length 2214 * STATE1: initial state (input) 2215 * STATE2 2216 * STATE3 2217 * STATE4 2218 * output: 2219 * STATE1: finial state (output) 2220 * STATE2 2221 * STATE3 2222 * STATE4 2223 * changed: 2224 * KEY 2225 * TKEYP (T1) 2226 */ 2227.align 4 2228_aesni_dec4: 2229 movaps (KEYP), KEY # key 2230 mov KEYP, TKEYP 2231 pxor KEY, STATE1 # round 0 2232 pxor KEY, STATE2 2233 pxor KEY, STATE3 2234 pxor KEY, STATE4 2235 add $0x30, TKEYP 2236 cmp $24, KLEN 2237 jb .L4dec128 2238 lea 0x20(TKEYP), TKEYP 2239 je .L4dec192 2240 add $0x20, TKEYP 2241 movaps -0x60(TKEYP), KEY 2242 AESDEC KEY STATE1 2243 AESDEC KEY STATE2 2244 AESDEC KEY STATE3 2245 AESDEC KEY STATE4 2246 movaps -0x50(TKEYP), KEY 2247 AESDEC KEY STATE1 2248 AESDEC KEY STATE2 2249 AESDEC KEY STATE3 2250 AESDEC KEY STATE4 2251.align 4 2252.L4dec192: 2253 movaps -0x40(TKEYP), KEY 2254 AESDEC KEY STATE1 2255 AESDEC KEY STATE2 2256 AESDEC KEY STATE3 2257 AESDEC KEY STATE4 2258 movaps -0x30(TKEYP), KEY 2259 AESDEC KEY STATE1 2260 AESDEC KEY STATE2 2261 AESDEC KEY STATE3 2262 AESDEC KEY STATE4 2263.align 4 2264.L4dec128: 2265 movaps -0x20(TKEYP), KEY 2266 AESDEC KEY STATE1 2267 AESDEC KEY STATE2 2268 AESDEC KEY STATE3 2269 AESDEC KEY STATE4 2270 movaps -0x10(TKEYP), KEY 2271 AESDEC KEY STATE1 2272 AESDEC KEY STATE2 2273 AESDEC KEY STATE3 2274 AESDEC KEY STATE4 2275 movaps (TKEYP), KEY 2276 AESDEC KEY STATE1 2277 AESDEC KEY STATE2 2278 AESDEC KEY STATE3 2279 AESDEC KEY STATE4 2280 movaps 0x10(TKEYP), KEY 2281 AESDEC KEY STATE1 2282 AESDEC KEY STATE2 2283 AESDEC KEY STATE3 2284 AESDEC KEY STATE4 2285 movaps 0x20(TKEYP), KEY 2286 AESDEC KEY STATE1 2287 AESDEC KEY STATE2 2288 AESDEC KEY STATE3 2289 AESDEC KEY STATE4 2290 movaps 0x30(TKEYP), KEY 2291 AESDEC KEY STATE1 2292 AESDEC KEY STATE2 2293 AESDEC KEY STATE3 2294 AESDEC KEY STATE4 2295 movaps 0x40(TKEYP), KEY 2296 AESDEC KEY STATE1 2297 AESDEC KEY STATE2 2298 AESDEC KEY STATE3 2299 AESDEC KEY STATE4 2300 movaps 0x50(TKEYP), KEY 2301 AESDEC KEY STATE1 2302 AESDEC KEY STATE2 2303 AESDEC KEY STATE3 2304 AESDEC KEY STATE4 2305 movaps 0x60(TKEYP), KEY 2306 AESDEC KEY STATE1 2307 AESDEC KEY STATE2 2308 AESDEC KEY STATE3 2309 AESDEC KEY STATE4 2310 movaps 0x70(TKEYP), KEY 2311 AESDECLAST KEY STATE1 # last round 2312 AESDECLAST KEY STATE2 2313 AESDECLAST KEY STATE3 2314 AESDECLAST KEY STATE4 2315 ret 2316ENDPROC(_aesni_dec4) 2317 2318/* 2319 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2320 * size_t len) 2321 */ 2322ENTRY(aesni_ecb_enc) 2323 FRAME_BEGIN 2324#ifndef __x86_64__ 2325 pushl LEN 2326 pushl KEYP 2327 pushl KLEN 2328 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2329 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2330 movl (FRAME_OFFSET+24)(%esp), INP # src 2331 movl (FRAME_OFFSET+28)(%esp), LEN # len 2332#endif 2333 test LEN, LEN # check length 2334 jz .Lecb_enc_ret 2335 mov 480(KEYP), KLEN 2336 cmp $16, LEN 2337 jb .Lecb_enc_ret 2338 cmp $64, LEN 2339 jb .Lecb_enc_loop1 2340.align 4 2341.Lecb_enc_loop4: 2342 movups (INP), STATE1 2343 movups 0x10(INP), STATE2 2344 movups 0x20(INP), STATE3 2345 movups 0x30(INP), STATE4 2346 call _aesni_enc4 2347 movups STATE1, (OUTP) 2348 movups STATE2, 0x10(OUTP) 2349 movups STATE3, 0x20(OUTP) 2350 movups STATE4, 0x30(OUTP) 2351 sub $64, LEN 2352 add $64, INP 2353 add $64, OUTP 2354 cmp $64, LEN 2355 jge .Lecb_enc_loop4 2356 cmp $16, LEN 2357 jb .Lecb_enc_ret 2358.align 4 2359.Lecb_enc_loop1: 2360 movups (INP), STATE1 2361 call _aesni_enc1 2362 movups STATE1, (OUTP) 2363 sub $16, LEN 2364 add $16, INP 2365 add $16, OUTP 2366 cmp $16, LEN 2367 jge .Lecb_enc_loop1 2368.Lecb_enc_ret: 2369#ifndef __x86_64__ 2370 popl KLEN 2371 popl KEYP 2372 popl LEN 2373#endif 2374 FRAME_END 2375 ret 2376ENDPROC(aesni_ecb_enc) 2377 2378/* 2379 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2380 * size_t len); 2381 */ 2382ENTRY(aesni_ecb_dec) 2383 FRAME_BEGIN 2384#ifndef __x86_64__ 2385 pushl LEN 2386 pushl KEYP 2387 pushl KLEN 2388 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2389 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2390 movl (FRAME_OFFSET+24)(%esp), INP # src 2391 movl (FRAME_OFFSET+28)(%esp), LEN # len 2392#endif 2393 test LEN, LEN 2394 jz .Lecb_dec_ret 2395 mov 480(KEYP), KLEN 2396 add $240, KEYP 2397 cmp $16, LEN 2398 jb .Lecb_dec_ret 2399 cmp $64, LEN 2400 jb .Lecb_dec_loop1 2401.align 4 2402.Lecb_dec_loop4: 2403 movups (INP), STATE1 2404 movups 0x10(INP), STATE2 2405 movups 0x20(INP), STATE3 2406 movups 0x30(INP), STATE4 2407 call _aesni_dec4 2408 movups STATE1, (OUTP) 2409 movups STATE2, 0x10(OUTP) 2410 movups STATE3, 0x20(OUTP) 2411 movups STATE4, 0x30(OUTP) 2412 sub $64, LEN 2413 add $64, INP 2414 add $64, OUTP 2415 cmp $64, LEN 2416 jge .Lecb_dec_loop4 2417 cmp $16, LEN 2418 jb .Lecb_dec_ret 2419.align 4 2420.Lecb_dec_loop1: 2421 movups (INP), STATE1 2422 call _aesni_dec1 2423 movups STATE1, (OUTP) 2424 sub $16, LEN 2425 add $16, INP 2426 add $16, OUTP 2427 cmp $16, LEN 2428 jge .Lecb_dec_loop1 2429.Lecb_dec_ret: 2430#ifndef __x86_64__ 2431 popl KLEN 2432 popl KEYP 2433 popl LEN 2434#endif 2435 FRAME_END 2436 ret 2437ENDPROC(aesni_ecb_dec) 2438 2439/* 2440 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2441 * size_t len, u8 *iv) 2442 */ 2443ENTRY(aesni_cbc_enc) 2444 FRAME_BEGIN 2445#ifndef __x86_64__ 2446 pushl IVP 2447 pushl LEN 2448 pushl KEYP 2449 pushl KLEN 2450 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2451 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2452 movl (FRAME_OFFSET+28)(%esp), INP # src 2453 movl (FRAME_OFFSET+32)(%esp), LEN # len 2454 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2455#endif 2456 cmp $16, LEN 2457 jb .Lcbc_enc_ret 2458 mov 480(KEYP), KLEN 2459 movups (IVP), STATE # load iv as initial state 2460.align 4 2461.Lcbc_enc_loop: 2462 movups (INP), IN # load input 2463 pxor IN, STATE 2464 call _aesni_enc1 2465 movups STATE, (OUTP) # store output 2466 sub $16, LEN 2467 add $16, INP 2468 add $16, OUTP 2469 cmp $16, LEN 2470 jge .Lcbc_enc_loop 2471 movups STATE, (IVP) 2472.Lcbc_enc_ret: 2473#ifndef __x86_64__ 2474 popl KLEN 2475 popl KEYP 2476 popl LEN 2477 popl IVP 2478#endif 2479 FRAME_END 2480 ret 2481ENDPROC(aesni_cbc_enc) 2482 2483/* 2484 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2485 * size_t len, u8 *iv) 2486 */ 2487ENTRY(aesni_cbc_dec) 2488 FRAME_BEGIN 2489#ifndef __x86_64__ 2490 pushl IVP 2491 pushl LEN 2492 pushl KEYP 2493 pushl KLEN 2494 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2495 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2496 movl (FRAME_OFFSET+28)(%esp), INP # src 2497 movl (FRAME_OFFSET+32)(%esp), LEN # len 2498 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2499#endif 2500 cmp $16, LEN 2501 jb .Lcbc_dec_just_ret 2502 mov 480(KEYP), KLEN 2503 add $240, KEYP 2504 movups (IVP), IV 2505 cmp $64, LEN 2506 jb .Lcbc_dec_loop1 2507.align 4 2508.Lcbc_dec_loop4: 2509 movups (INP), IN1 2510 movaps IN1, STATE1 2511 movups 0x10(INP), IN2 2512 movaps IN2, STATE2 2513#ifdef __x86_64__ 2514 movups 0x20(INP), IN3 2515 movaps IN3, STATE3 2516 movups 0x30(INP), IN4 2517 movaps IN4, STATE4 2518#else 2519 movups 0x20(INP), IN1 2520 movaps IN1, STATE3 2521 movups 0x30(INP), IN2 2522 movaps IN2, STATE4 2523#endif 2524 call _aesni_dec4 2525 pxor IV, STATE1 2526#ifdef __x86_64__ 2527 pxor IN1, STATE2 2528 pxor IN2, STATE3 2529 pxor IN3, STATE4 2530 movaps IN4, IV 2531#else 2532 pxor IN1, STATE4 2533 movaps IN2, IV 2534 movups (INP), IN1 2535 pxor IN1, STATE2 2536 movups 0x10(INP), IN2 2537 pxor IN2, STATE3 2538#endif 2539 movups STATE1, (OUTP) 2540 movups STATE2, 0x10(OUTP) 2541 movups STATE3, 0x20(OUTP) 2542 movups STATE4, 0x30(OUTP) 2543 sub $64, LEN 2544 add $64, INP 2545 add $64, OUTP 2546 cmp $64, LEN 2547 jge .Lcbc_dec_loop4 2548 cmp $16, LEN 2549 jb .Lcbc_dec_ret 2550.align 4 2551.Lcbc_dec_loop1: 2552 movups (INP), IN 2553 movaps IN, STATE 2554 call _aesni_dec1 2555 pxor IV, STATE 2556 movups STATE, (OUTP) 2557 movaps IN, IV 2558 sub $16, LEN 2559 add $16, INP 2560 add $16, OUTP 2561 cmp $16, LEN 2562 jge .Lcbc_dec_loop1 2563.Lcbc_dec_ret: 2564 movups IV, (IVP) 2565.Lcbc_dec_just_ret: 2566#ifndef __x86_64__ 2567 popl KLEN 2568 popl KEYP 2569 popl LEN 2570 popl IVP 2571#endif 2572 FRAME_END 2573 ret 2574ENDPROC(aesni_cbc_dec) 2575 2576#ifdef __x86_64__ 2577.pushsection .rodata 2578.align 16 2579.Lbswap_mask: 2580 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2581.popsection 2582 2583/* 2584 * _aesni_inc_init: internal ABI 2585 * setup registers used by _aesni_inc 2586 * input: 2587 * IV 2588 * output: 2589 * CTR: == IV, in little endian 2590 * TCTR_LOW: == lower qword of CTR 2591 * INC: == 1, in little endian 2592 * BSWAP_MASK == endian swapping mask 2593 */ 2594.align 4 2595_aesni_inc_init: 2596 movaps .Lbswap_mask, BSWAP_MASK 2597 movaps IV, CTR 2598 PSHUFB_XMM BSWAP_MASK CTR 2599 mov $1, TCTR_LOW 2600 MOVQ_R64_XMM TCTR_LOW INC 2601 MOVQ_R64_XMM CTR TCTR_LOW 2602 ret 2603ENDPROC(_aesni_inc_init) 2604 2605/* 2606 * _aesni_inc: internal ABI 2607 * Increase IV by 1, IV is in big endian 2608 * input: 2609 * IV 2610 * CTR: == IV, in little endian 2611 * TCTR_LOW: == lower qword of CTR 2612 * INC: == 1, in little endian 2613 * BSWAP_MASK == endian swapping mask 2614 * output: 2615 * IV: Increase by 1 2616 * changed: 2617 * CTR: == output IV, in little endian 2618 * TCTR_LOW: == lower qword of CTR 2619 */ 2620.align 4 2621_aesni_inc: 2622 paddq INC, CTR 2623 add $1, TCTR_LOW 2624 jnc .Linc_low 2625 pslldq $8, INC 2626 paddq INC, CTR 2627 psrldq $8, INC 2628.Linc_low: 2629 movaps CTR, IV 2630 PSHUFB_XMM BSWAP_MASK IV 2631 ret 2632ENDPROC(_aesni_inc) 2633 2634/* 2635 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2636 * size_t len, u8 *iv) 2637 */ 2638ENTRY(aesni_ctr_enc) 2639 FRAME_BEGIN 2640 cmp $16, LEN 2641 jb .Lctr_enc_just_ret 2642 mov 480(KEYP), KLEN 2643 movups (IVP), IV 2644 call _aesni_inc_init 2645 cmp $64, LEN 2646 jb .Lctr_enc_loop1 2647.align 4 2648.Lctr_enc_loop4: 2649 movaps IV, STATE1 2650 call _aesni_inc 2651 movups (INP), IN1 2652 movaps IV, STATE2 2653 call _aesni_inc 2654 movups 0x10(INP), IN2 2655 movaps IV, STATE3 2656 call _aesni_inc 2657 movups 0x20(INP), IN3 2658 movaps IV, STATE4 2659 call _aesni_inc 2660 movups 0x30(INP), IN4 2661 call _aesni_enc4 2662 pxor IN1, STATE1 2663 movups STATE1, (OUTP) 2664 pxor IN2, STATE2 2665 movups STATE2, 0x10(OUTP) 2666 pxor IN3, STATE3 2667 movups STATE3, 0x20(OUTP) 2668 pxor IN4, STATE4 2669 movups STATE4, 0x30(OUTP) 2670 sub $64, LEN 2671 add $64, INP 2672 add $64, OUTP 2673 cmp $64, LEN 2674 jge .Lctr_enc_loop4 2675 cmp $16, LEN 2676 jb .Lctr_enc_ret 2677.align 4 2678.Lctr_enc_loop1: 2679 movaps IV, STATE 2680 call _aesni_inc 2681 movups (INP), IN 2682 call _aesni_enc1 2683 pxor IN, STATE 2684 movups STATE, (OUTP) 2685 sub $16, LEN 2686 add $16, INP 2687 add $16, OUTP 2688 cmp $16, LEN 2689 jge .Lctr_enc_loop1 2690.Lctr_enc_ret: 2691 movups IV, (IVP) 2692.Lctr_enc_just_ret: 2693 FRAME_END 2694 ret 2695ENDPROC(aesni_ctr_enc) 2696 2697/* 2698 * _aesni_gf128mul_x_ble: internal ABI 2699 * Multiply in GF(2^128) for XTS IVs 2700 * input: 2701 * IV: current IV 2702 * GF128MUL_MASK == mask with 0x87 and 0x01 2703 * output: 2704 * IV: next IV 2705 * changed: 2706 * CTR: == temporary value 2707 */ 2708#define _aesni_gf128mul_x_ble() \ 2709 pshufd $0x13, IV, CTR; \ 2710 paddq IV, IV; \ 2711 psrad $31, CTR; \ 2712 pand GF128MUL_MASK, CTR; \ 2713 pxor CTR, IV; 2714 2715/* 2716 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2717 * bool enc, u8 *iv) 2718 */ 2719ENTRY(aesni_xts_crypt8) 2720 FRAME_BEGIN 2721 cmpb $0, %cl 2722 movl $0, %ecx 2723 movl $240, %r10d 2724 leaq _aesni_enc4, %r11 2725 leaq _aesni_dec4, %rax 2726 cmovel %r10d, %ecx 2727 cmoveq %rax, %r11 2728 2729 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK 2730 movups (IVP), IV 2731 2732 mov 480(KEYP), KLEN 2733 addq %rcx, KEYP 2734 2735 movdqa IV, STATE1 2736 movdqu 0x00(INP), INC 2737 pxor INC, STATE1 2738 movdqu IV, 0x00(OUTP) 2739 2740 _aesni_gf128mul_x_ble() 2741 movdqa IV, STATE2 2742 movdqu 0x10(INP), INC 2743 pxor INC, STATE2 2744 movdqu IV, 0x10(OUTP) 2745 2746 _aesni_gf128mul_x_ble() 2747 movdqa IV, STATE3 2748 movdqu 0x20(INP), INC 2749 pxor INC, STATE3 2750 movdqu IV, 0x20(OUTP) 2751 2752 _aesni_gf128mul_x_ble() 2753 movdqa IV, STATE4 2754 movdqu 0x30(INP), INC 2755 pxor INC, STATE4 2756 movdqu IV, 0x30(OUTP) 2757 2758 call *%r11 2759 2760 movdqu 0x00(OUTP), INC 2761 pxor INC, STATE1 2762 movdqu STATE1, 0x00(OUTP) 2763 2764 _aesni_gf128mul_x_ble() 2765 movdqa IV, STATE1 2766 movdqu 0x40(INP), INC 2767 pxor INC, STATE1 2768 movdqu IV, 0x40(OUTP) 2769 2770 movdqu 0x10(OUTP), INC 2771 pxor INC, STATE2 2772 movdqu STATE2, 0x10(OUTP) 2773 2774 _aesni_gf128mul_x_ble() 2775 movdqa IV, STATE2 2776 movdqu 0x50(INP), INC 2777 pxor INC, STATE2 2778 movdqu IV, 0x50(OUTP) 2779 2780 movdqu 0x20(OUTP), INC 2781 pxor INC, STATE3 2782 movdqu STATE3, 0x20(OUTP) 2783 2784 _aesni_gf128mul_x_ble() 2785 movdqa IV, STATE3 2786 movdqu 0x60(INP), INC 2787 pxor INC, STATE3 2788 movdqu IV, 0x60(OUTP) 2789 2790 movdqu 0x30(OUTP), INC 2791 pxor INC, STATE4 2792 movdqu STATE4, 0x30(OUTP) 2793 2794 _aesni_gf128mul_x_ble() 2795 movdqa IV, STATE4 2796 movdqu 0x70(INP), INC 2797 pxor INC, STATE4 2798 movdqu IV, 0x70(OUTP) 2799 2800 _aesni_gf128mul_x_ble() 2801 movups IV, (IVP) 2802 2803 call *%r11 2804 2805 movdqu 0x40(OUTP), INC 2806 pxor INC, STATE1 2807 movdqu STATE1, 0x40(OUTP) 2808 2809 movdqu 0x50(OUTP), INC 2810 pxor INC, STATE2 2811 movdqu STATE2, 0x50(OUTP) 2812 2813 movdqu 0x60(OUTP), INC 2814 pxor INC, STATE3 2815 movdqu STATE3, 0x60(OUTP) 2816 2817 movdqu 0x70(OUTP), INC 2818 pxor INC, STATE4 2819 movdqu STATE4, 0x70(OUTP) 2820 2821 FRAME_END 2822 ret 2823ENDPROC(aesni_xts_crypt8) 2824 2825#endif 2826