1/* 2 * Implement AES algorithm in Intel AES-NI instructions. 3 * 4 * The white paper of AES-NI instructions can be downloaded from: 5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf 6 * 7 * Copyright (C) 2008, Intel Corp. 8 * Author: Huang Ying <ying.huang@intel.com> 9 * Vinodh Gopal <vinodh.gopal@intel.com> 10 * Kahraman Akdemir 11 * 12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD 13 * interface for 64-bit kernels. 14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) 15 * Aidan O'Mahony (aidan.o.mahony@intel.com) 16 * Adrian Hoban <adrian.hoban@intel.com> 17 * James Guilford (james.guilford@intel.com) 18 * Gabriele Paoloni <gabriele.paoloni@intel.com> 19 * Tadeusz Struk (tadeusz.struk@intel.com) 20 * Wajdi Feghali (wajdi.k.feghali@intel.com) 21 * Copyright (c) 2010, Intel Corporation. 22 * 23 * Ported x86_64 version to x86: 24 * Author: Mathias Krause <minipli@googlemail.com> 25 * 26 * This program is free software; you can redistribute it and/or modify 27 * it under the terms of the GNU General Public License as published by 28 * the Free Software Foundation; either version 2 of the License, or 29 * (at your option) any later version. 30 */ 31 32#include <linux/linkage.h> 33#include <asm/inst.h> 34 35/* 36 * The following macros are used to move an (un)aligned 16 byte value to/from 37 * an XMM register. This can done for either FP or integer values, for FP use 38 * movaps (move aligned packed single) or integer use movdqa (move double quad 39 * aligned). It doesn't make a performance difference which instruction is used 40 * since Nehalem (original Core i7) was released. However, the movaps is a byte 41 * shorter, so that is the one we'll use for now. (same for unaligned). 42 */ 43#define MOVADQ movaps 44#define MOVUDQ movups 45 46#ifdef __x86_64__ 47 48.data 49.align 16 50.Lgf128mul_x_ble_mask: 51 .octa 0x00000000000000010000000000000087 52POLY: .octa 0xC2000000000000000000000000000001 53TWOONE: .octa 0x00000001000000000000000000000001 54 55# order of these constants should not change. 
56# more specifically, ALL_F should follow SHIFT_MASK, 57# and ZERO should follow ALL_F 58 59SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F 60MASK1: .octa 0x0000000000000000ffffffffffffffff 61MASK2: .octa 0xffffffffffffffff0000000000000000 62SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 63ALL_F: .octa 0xffffffffffffffffffffffffffffffff 64ZERO: .octa 0x00000000000000000000000000000000 65ONE: .octa 0x00000000000000000000000000000001 66F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 67dec: .octa 0x1 68enc: .octa 0x2 69 70 71.text 72 73 74#define STACK_OFFSET 8*3 75#define HashKey 16*0 // store HashKey <<1 mod poly here 76#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here 77#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here 78#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here 79#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64 80 // bits of HashKey <<1 mod poly here 81 //(for Karatsuba purposes) 82#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64 83 // bits of HashKey^2 <<1 mod poly here 84 // (for Karatsuba purposes) 85#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64 86 // bits of HashKey^3 <<1 mod poly here 87 // (for Karatsuba purposes) 88#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64 89 // bits of HashKey^4 <<1 mod poly here 90 // (for Karatsuba purposes) 91#define VARIABLE_OFFSET 16*8 92 93#define arg1 rdi 94#define arg2 rsi 95#define arg3 rdx 96#define arg4 rcx 97#define arg5 r8 98#define arg6 r9 99#define arg7 STACK_OFFSET+8(%r14) 100#define arg8 STACK_OFFSET+16(%r14) 101#define arg9 STACK_OFFSET+24(%r14) 102#define arg10 STACK_OFFSET+32(%r14) 103#define keysize 2*15*16(%arg1) 104#endif 105 106 107#define STATE1 %xmm0 108#define STATE2 %xmm4 109#define STATE3 %xmm5 110#define STATE4 %xmm6 111#define STATE STATE1 112#define IN1 %xmm1 113#define IN2 %xmm7 114#define IN3 %xmm8 115#define IN4 %xmm9 116#define IN IN1 117#define KEY %xmm2 118#define IV %xmm3 119 120#define BSWAP_MASK %xmm10 121#define CTR %xmm11 122#define INC %xmm12 123 124#define GF128MUL_MASK %xmm10 125 126#ifdef __x86_64__ 127#define AREG %rax 128#define KEYP %rdi 129#define OUTP %rsi 130#define UKEYP OUTP 131#define INP %rdx 132#define LEN %rcx 133#define IVP %r8 134#define KLEN %r9d 135#define T1 %r10 136#define TKEYP T1 137#define T2 %r11 138#define TCTR_LOW T2 139#else 140#define AREG %eax 141#define KEYP %edi 142#define OUTP AREG 143#define UKEYP OUTP 144#define INP %edx 145#define LEN %esi 146#define IVP %ebp 147#define KLEN %ebx 148#define T1 %ecx 149#define TKEYP T1 150#endif 151 152 153#ifdef __x86_64__ 154/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 155* 156* 157* Input: A and B (128-bits each, bit-reflected) 158* Output: C = A*B*x mod poly, (i.e. >>1 ) 159* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 160* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
161* 162*/ 163.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 164 movdqa \GH, \TMP1 165 pshufd $78, \GH, \TMP2 166 pshufd $78, \HK, \TMP3 167 pxor \GH, \TMP2 # TMP2 = a1+a0 168 pxor \HK, \TMP3 # TMP3 = b1+b0 169 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 170 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 171 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) 172 pxor \GH, \TMP2 173 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) 174 movdqa \TMP2, \TMP3 175 pslldq $8, \TMP3 # left shift TMP3 2 DWs 176 psrldq $8, \TMP2 # right shift TMP2 2 DWs 177 pxor \TMP3, \GH 178 pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK 179 180 # first phase of the reduction 181 182 movdqa \GH, \TMP2 183 movdqa \GH, \TMP3 184 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 185 # in in order to perform 186 # independent shifts 187 pslld $31, \TMP2 # packed right shift <<31 188 pslld $30, \TMP3 # packed right shift <<30 189 pslld $25, \TMP4 # packed right shift <<25 190 pxor \TMP3, \TMP2 # xor the shifted versions 191 pxor \TMP4, \TMP2 192 movdqa \TMP2, \TMP5 193 psrldq $4, \TMP5 # right shift TMP5 1 DW 194 pslldq $12, \TMP2 # left shift TMP2 3 DWs 195 pxor \TMP2, \GH 196 197 # second phase of the reduction 198 199 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 200 # in in order to perform 201 # independent shifts 202 movdqa \GH,\TMP3 203 movdqa \GH,\TMP4 204 psrld $1,\TMP2 # packed left shift >>1 205 psrld $2,\TMP3 # packed left shift >>2 206 psrld $7,\TMP4 # packed left shift >>7 207 pxor \TMP3,\TMP2 # xor the shifted versions 208 pxor \TMP4,\TMP2 209 pxor \TMP5, \TMP2 210 pxor \TMP2, \GH 211 pxor \TMP1, \GH # result is in TMP1 212.endm 213 214/* 215* if a = number of total plaintext bytes 216* b = floor(a/16) 217* num_initial_blocks = b mod 4 218* encrypt the initial num_initial_blocks blocks and apply ghash on 219* the ciphertext 220* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 221* are clobbered 222* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified 223*/ 224 225 226.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 227XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 228 MOVADQ SHUF_MASK(%rip), %xmm14 229 mov arg7, %r10 # %r10 = AAD 230 mov arg8, %r12 # %r12 = aadLen 231 mov %r12, %r11 232 pxor %xmm\i, %xmm\i 233 234_get_AAD_loop\num_initial_blocks\operation: 235 movd (%r10), \TMP1 236 pslldq $12, \TMP1 237 psrldq $4, %xmm\i 238 pxor \TMP1, %xmm\i 239 add $4, %r10 240 sub $4, %r12 241 jne _get_AAD_loop\num_initial_blocks\operation 242 243 cmp $16, %r11 244 je _get_AAD_loop2_done\num_initial_blocks\operation 245 246 mov $16, %r12 247_get_AAD_loop2\num_initial_blocks\operation: 248 psrldq $4, %xmm\i 249 sub $4, %r12 250 cmp %r11, %r12 251 jne _get_AAD_loop2\num_initial_blocks\operation 252 253_get_AAD_loop2_done\num_initial_blocks\operation: 254 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 255 256 xor %r11, %r11 # initialise the data pointer offset as zero 257 258 # start AES for num_initial_blocks blocks 259 260 mov %arg5, %rax # %rax = *Y0 261 movdqu (%rax), \XMM0 # XMM0 = Y0 262 PSHUFB_XMM %xmm14, \XMM0 263 264.if (\i == 5) || (\i == 6) || (\i == 7) 265 MOVADQ ONE(%RIP),\TMP1 266 MOVADQ (%arg1),\TMP2 267.irpc index, \i_seq 268 paddd \TMP1, \XMM0 # INCR Y0 269 movdqa \XMM0, %xmm\index 270 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 271 pxor \TMP2, %xmm\index 272.endr 273 lea 0x10(%arg1),%r10 274 mov keysize,%eax 275 shr $2,%eax # 128->4, 192->6, 256->8 276 add $5,%eax # 128->9, 192->11, 256->13 277 
278aes_loop_initial_dec\num_initial_blocks: 279 MOVADQ (%r10),\TMP1 280.irpc index, \i_seq 281 AESENC \TMP1, %xmm\index 282.endr 283 add $16,%r10 284 sub $1,%eax 285 jnz aes_loop_initial_dec\num_initial_blocks 286 287 MOVADQ (%r10), \TMP1 288.irpc index, \i_seq 289 AESENCLAST \TMP1, %xmm\index # Last Round 290.endr 291.irpc index, \i_seq 292 movdqu (%arg3 , %r11, 1), \TMP1 293 pxor \TMP1, %xmm\index 294 movdqu %xmm\index, (%arg2 , %r11, 1) 295 # write back plaintext/ciphertext for num_initial_blocks 296 add $16, %r11 297 298 movdqa \TMP1, %xmm\index 299 PSHUFB_XMM %xmm14, %xmm\index 300 # prepare plaintext/ciphertext for GHASH computation 301.endr 302.endif 303 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 304 # apply GHASH on num_initial_blocks blocks 305 306.if \i == 5 307 pxor %xmm5, %xmm6 308 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 309 pxor %xmm6, %xmm7 310 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 311 pxor %xmm7, %xmm8 312 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 313.elseif \i == 6 314 pxor %xmm6, %xmm7 315 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 316 pxor %xmm7, %xmm8 317 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 318.elseif \i == 7 319 pxor %xmm7, %xmm8 320 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 321.endif 322 cmp $64, %r13 323 jl _initial_blocks_done\num_initial_blocks\operation 324 # no need for precomputed values 325/* 326* 327* Precomputations for HashKey parallel with encryption of first 4 blocks. 328* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 329*/ 330 MOVADQ ONE(%rip), \TMP1 331 paddd \TMP1, \XMM0 # INCR Y0 332 MOVADQ \XMM0, \XMM1 333 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 334 335 paddd \TMP1, \XMM0 # INCR Y0 336 MOVADQ \XMM0, \XMM2 337 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 338 339 paddd \TMP1, \XMM0 # INCR Y0 340 MOVADQ \XMM0, \XMM3 341 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 342 343 paddd \TMP1, \XMM0 # INCR Y0 344 MOVADQ \XMM0, \XMM4 345 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 346 347 MOVADQ 0(%arg1),\TMP1 348 pxor \TMP1, \XMM1 349 pxor \TMP1, \XMM2 350 pxor \TMP1, \XMM3 351 pxor \TMP1, \XMM4 352 movdqa \TMP3, \TMP5 353 pshufd $78, \TMP3, \TMP1 354 pxor \TMP3, \TMP1 355 movdqa \TMP1, HashKey_k(%rsp) 356 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 357# TMP5 = HashKey^2<<1 (mod poly) 358 movdqa \TMP5, HashKey_2(%rsp) 359# HashKey_2 = HashKey^2<<1 (mod poly) 360 pshufd $78, \TMP5, \TMP1 361 pxor \TMP5, \TMP1 362 movdqa \TMP1, HashKey_2_k(%rsp) 363.irpc index, 1234 # do 4 rounds 364 movaps 0x10*\index(%arg1), \TMP1 365 AESENC \TMP1, \XMM1 366 AESENC \TMP1, \XMM2 367 AESENC \TMP1, \XMM3 368 AESENC \TMP1, \XMM4 369.endr 370 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 371# TMP5 = HashKey^3<<1 (mod poly) 372 movdqa \TMP5, HashKey_3(%rsp) 373 pshufd $78, \TMP5, \TMP1 374 pxor \TMP5, \TMP1 375 movdqa \TMP1, HashKey_3_k(%rsp) 376.irpc index, 56789 # do next 5 rounds 377 movaps 0x10*\index(%arg1), \TMP1 378 AESENC \TMP1, \XMM1 379 AESENC \TMP1, \XMM2 380 AESENC \TMP1, \XMM3 381 AESENC \TMP1, \XMM4 382.endr 383 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 384# TMP5 = HashKey^3<<1 (mod poly) 385 movdqa \TMP5, HashKey_4(%rsp) 386 pshufd $78, \TMP5, \TMP1 387 pxor \TMP5, \TMP1 388 movdqa \TMP1, HashKey_4_k(%rsp) 389 lea 0xa0(%arg1),%r10 390 mov keysize,%eax 391 shr $2,%eax # 128->4, 192->6, 256->8 392 sub $4,%eax # 128->0, 192->2, 256->4 393 jz 
aes_loop_pre_dec_done\num_initial_blocks 394 395aes_loop_pre_dec\num_initial_blocks: 396 MOVADQ (%r10),\TMP2 397.irpc index, 1234 398 AESENC \TMP2, %xmm\index 399.endr 400 add $16,%r10 401 sub $1,%eax 402 jnz aes_loop_pre_dec\num_initial_blocks 403 404aes_loop_pre_dec_done\num_initial_blocks: 405 MOVADQ (%r10), \TMP2 406 AESENCLAST \TMP2, \XMM1 407 AESENCLAST \TMP2, \XMM2 408 AESENCLAST \TMP2, \XMM3 409 AESENCLAST \TMP2, \XMM4 410 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 411 pxor \TMP1, \XMM1 412 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 413 movdqa \TMP1, \XMM1 414 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 415 pxor \TMP1, \XMM2 416 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 417 movdqa \TMP1, \XMM2 418 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 419 pxor \TMP1, \XMM3 420 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 421 movdqa \TMP1, \XMM3 422 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 423 pxor \TMP1, \XMM4 424 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 425 movdqa \TMP1, \XMM4 426 add $64, %r11 427 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 428 pxor \XMMDst, \XMM1 429# combine GHASHed value with the corresponding ciphertext 430 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 431 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 432 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 433 434_initial_blocks_done\num_initial_blocks\operation: 435 436.endm 437 438 439/* 440* if a = number of total plaintext bytes 441* b = floor(a/16) 442* num_initial_blocks = b mod 4 443* encrypt the initial num_initial_blocks blocks and apply ghash on 444* the ciphertext 445* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 446* are clobbered 447* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified 448*/ 449 450 451.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 452XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 453 MOVADQ SHUF_MASK(%rip), %xmm14 454 mov arg7, %r10 # %r10 = AAD 455 mov arg8, %r12 # %r12 = aadLen 456 mov %r12, %r11 457 pxor %xmm\i, %xmm\i 458_get_AAD_loop\num_initial_blocks\operation: 459 movd (%r10), \TMP1 460 pslldq $12, \TMP1 461 psrldq $4, %xmm\i 462 pxor \TMP1, %xmm\i 463 add $4, %r10 464 sub $4, %r12 465 jne _get_AAD_loop\num_initial_blocks\operation 466 cmp $16, %r11 467 je _get_AAD_loop2_done\num_initial_blocks\operation 468 mov $16, %r12 469_get_AAD_loop2\num_initial_blocks\operation: 470 psrldq $4, %xmm\i 471 sub $4, %r12 472 cmp %r11, %r12 473 jne _get_AAD_loop2\num_initial_blocks\operation 474_get_AAD_loop2_done\num_initial_blocks\operation: 475 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 476 477 xor %r11, %r11 # initialise the data pointer offset as zero 478 479 # start AES for num_initial_blocks blocks 480 481 mov %arg5, %rax # %rax = *Y0 482 movdqu (%rax), \XMM0 # XMM0 = Y0 483 PSHUFB_XMM %xmm14, \XMM0 484 485.if (\i == 5) || (\i == 6) || (\i == 7) 486 487 MOVADQ ONE(%RIP),\TMP1 488 MOVADQ 0(%arg1),\TMP2 489.irpc index, \i_seq 490 paddd \TMP1, \XMM0 # INCR Y0 491 MOVADQ \XMM0, %xmm\index 492 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 493 pxor \TMP2, %xmm\index 494.endr 495 lea 0x10(%arg1),%r10 496 mov keysize,%eax 497 shr $2,%eax # 128->4, 192->6, 256->8 498 add $5,%eax # 128->9, 192->11, 256->13 499 500aes_loop_initial_enc\num_initial_blocks: 501 MOVADQ (%r10),\TMP1 502.irpc index, \i_seq 503 AESENC \TMP1, %xmm\index 504.endr 505 add $16,%r10 506 sub $1,%eax 507 jnz aes_loop_initial_enc\num_initial_blocks 508 509 MOVADQ (%r10), \TMP1 510.irpc index, \i_seq 511 AESENCLAST \TMP1, %xmm\index # Last Round 512.endr 513.irpc index, 
\i_seq 514 movdqu (%arg3 , %r11, 1), \TMP1 515 pxor \TMP1, %xmm\index 516 movdqu %xmm\index, (%arg2 , %r11, 1) 517 # write back plaintext/ciphertext for num_initial_blocks 518 add $16, %r11 519 PSHUFB_XMM %xmm14, %xmm\index 520 521 # prepare plaintext/ciphertext for GHASH computation 522.endr 523.endif 524 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 525 # apply GHASH on num_initial_blocks blocks 526 527.if \i == 5 528 pxor %xmm5, %xmm6 529 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 530 pxor %xmm6, %xmm7 531 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 532 pxor %xmm7, %xmm8 533 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 534.elseif \i == 6 535 pxor %xmm6, %xmm7 536 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 537 pxor %xmm7, %xmm8 538 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 539.elseif \i == 7 540 pxor %xmm7, %xmm8 541 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 542.endif 543 cmp $64, %r13 544 jl _initial_blocks_done\num_initial_blocks\operation 545 # no need for precomputed values 546/* 547* 548* Precomputations for HashKey parallel with encryption of first 4 blocks. 549* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 550*/ 551 MOVADQ ONE(%RIP),\TMP1 552 paddd \TMP1, \XMM0 # INCR Y0 553 MOVADQ \XMM0, \XMM1 554 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 555 556 paddd \TMP1, \XMM0 # INCR Y0 557 MOVADQ \XMM0, \XMM2 558 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 559 560 paddd \TMP1, \XMM0 # INCR Y0 561 MOVADQ \XMM0, \XMM3 562 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 563 564 paddd \TMP1, \XMM0 # INCR Y0 565 MOVADQ \XMM0, \XMM4 566 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 567 568 MOVADQ 0(%arg1),\TMP1 569 pxor \TMP1, \XMM1 570 pxor \TMP1, \XMM2 571 pxor \TMP1, \XMM3 572 pxor \TMP1, \XMM4 573 movdqa \TMP3, \TMP5 574 pshufd $78, \TMP3, \TMP1 575 pxor \TMP3, \TMP1 576 movdqa \TMP1, HashKey_k(%rsp) 577 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 578# TMP5 = HashKey^2<<1 (mod poly) 579 movdqa \TMP5, HashKey_2(%rsp) 580# HashKey_2 = HashKey^2<<1 (mod poly) 581 pshufd $78, \TMP5, \TMP1 582 pxor \TMP5, \TMP1 583 movdqa \TMP1, HashKey_2_k(%rsp) 584.irpc index, 1234 # do 4 rounds 585 movaps 0x10*\index(%arg1), \TMP1 586 AESENC \TMP1, \XMM1 587 AESENC \TMP1, \XMM2 588 AESENC \TMP1, \XMM3 589 AESENC \TMP1, \XMM4 590.endr 591 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 592# TMP5 = HashKey^3<<1 (mod poly) 593 movdqa \TMP5, HashKey_3(%rsp) 594 pshufd $78, \TMP5, \TMP1 595 pxor \TMP5, \TMP1 596 movdqa \TMP1, HashKey_3_k(%rsp) 597.irpc index, 56789 # do next 5 rounds 598 movaps 0x10*\index(%arg1), \TMP1 599 AESENC \TMP1, \XMM1 600 AESENC \TMP1, \XMM2 601 AESENC \TMP1, \XMM3 602 AESENC \TMP1, \XMM4 603.endr 604 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 605# TMP5 = HashKey^3<<1 (mod poly) 606 movdqa \TMP5, HashKey_4(%rsp) 607 pshufd $78, \TMP5, \TMP1 608 pxor \TMP5, \TMP1 609 movdqa \TMP1, HashKey_4_k(%rsp) 610 lea 0xa0(%arg1),%r10 611 mov keysize,%eax 612 shr $2,%eax # 128->4, 192->6, 256->8 613 sub $4,%eax # 128->0, 192->2, 256->4 614 jz aes_loop_pre_enc_done\num_initial_blocks 615 616aes_loop_pre_enc\num_initial_blocks: 617 MOVADQ (%r10),\TMP2 618.irpc index, 1234 619 AESENC \TMP2, %xmm\index 620.endr 621 add $16,%r10 622 sub $1,%eax 623 jnz aes_loop_pre_enc\num_initial_blocks 624 625aes_loop_pre_enc_done\num_initial_blocks: 626 MOVADQ (%r10), \TMP2 627 AESENCLAST \TMP2, \XMM1 628 AESENCLAST \TMP2, \XMM2 629 
AESENCLAST \TMP2, \XMM3 630 AESENCLAST \TMP2, \XMM4 631 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 632 pxor \TMP1, \XMM1 633 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 634 pxor \TMP1, \XMM2 635 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 636 pxor \TMP1, \XMM3 637 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 638 pxor \TMP1, \XMM4 639 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 640 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 641 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 642 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 643 644 add $64, %r11 645 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 646 pxor \XMMDst, \XMM1 647# combine GHASHed value with the corresponding ciphertext 648 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 649 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 650 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 651 652_initial_blocks_done\num_initial_blocks\operation: 653 654.endm 655 656/* 657* encrypt 4 blocks at a time 658* ghash the 4 previously encrypted ciphertext blocks 659* arg1, %arg2, %arg3 are used as pointers only, not modified 660* %r11 is the data offset value 661*/ 662.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ 663TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 664 665 movdqa \XMM1, \XMM5 666 movdqa \XMM2, \XMM6 667 movdqa \XMM3, \XMM7 668 movdqa \XMM4, \XMM8 669 670 movdqa SHUF_MASK(%rip), %xmm15 671 # multiply TMP5 * HashKey using karatsuba 672 673 movdqa \XMM5, \TMP4 674 pshufd $78, \XMM5, \TMP6 675 pxor \XMM5, \TMP6 676 paddd ONE(%rip), \XMM0 # INCR CNT 677 movdqa HashKey_4(%rsp), \TMP5 678 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 679 movdqa \XMM0, \XMM1 680 paddd ONE(%rip), \XMM0 # INCR CNT 681 movdqa \XMM0, \XMM2 682 paddd ONE(%rip), \XMM0 # INCR CNT 683 movdqa \XMM0, \XMM3 684 paddd ONE(%rip), \XMM0 # INCR CNT 685 movdqa \XMM0, \XMM4 686 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 687 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 688 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 689 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 690 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 691 692 pxor (%arg1), \XMM1 693 pxor (%arg1), \XMM2 694 pxor (%arg1), \XMM3 695 pxor (%arg1), \XMM4 696 movdqa HashKey_4_k(%rsp), \TMP5 697 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 698 movaps 0x10(%arg1), \TMP1 699 AESENC \TMP1, \XMM1 # Round 1 700 AESENC \TMP1, \XMM2 701 AESENC \TMP1, \XMM3 702 AESENC \TMP1, \XMM4 703 movaps 0x20(%arg1), \TMP1 704 AESENC \TMP1, \XMM1 # Round 2 705 AESENC \TMP1, \XMM2 706 AESENC \TMP1, \XMM3 707 AESENC \TMP1, \XMM4 708 movdqa \XMM6, \TMP1 709 pshufd $78, \XMM6, \TMP2 710 pxor \XMM6, \TMP2 711 movdqa HashKey_3(%rsp), \TMP5 712 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 713 movaps 0x30(%arg1), \TMP3 714 AESENC \TMP3, \XMM1 # Round 3 715 AESENC \TMP3, \XMM2 716 AESENC \TMP3, \XMM3 717 AESENC \TMP3, \XMM4 718 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 719 movaps 0x40(%arg1), \TMP3 720 AESENC \TMP3, \XMM1 # Round 4 721 AESENC \TMP3, \XMM2 722 AESENC \TMP3, \XMM3 723 AESENC \TMP3, \XMM4 724 movdqa HashKey_3_k(%rsp), \TMP5 725 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 726 movaps 0x50(%arg1), \TMP3 727 AESENC \TMP3, \XMM1 # Round 5 728 AESENC \TMP3, \XMM2 729 AESENC \TMP3, \XMM3 730 AESENC \TMP3, \XMM4 731 pxor \TMP1, \TMP4 732# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 733 pxor \XMM6, \XMM5 734 pxor \TMP2, \TMP6 735 movdqa \XMM7, \TMP1 736 pshufd $78, \XMM7, \TMP2 737 pxor \XMM7, \TMP2 738 movdqa HashKey_2(%rsp ), \TMP5 739 740 # Multiply TMP5 * HashKey using karatsuba 741 742 PCLMULQDQ 0x11, \TMP5, 
\TMP1 # TMP1 = a1*b1 743 movaps 0x60(%arg1), \TMP3 744 AESENC \TMP3, \XMM1 # Round 6 745 AESENC \TMP3, \XMM2 746 AESENC \TMP3, \XMM3 747 AESENC \TMP3, \XMM4 748 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 749 movaps 0x70(%arg1), \TMP3 750 AESENC \TMP3, \XMM1 # Round 7 751 AESENC \TMP3, \XMM2 752 AESENC \TMP3, \XMM3 753 AESENC \TMP3, \XMM4 754 movdqa HashKey_2_k(%rsp), \TMP5 755 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 756 movaps 0x80(%arg1), \TMP3 757 AESENC \TMP3, \XMM1 # Round 8 758 AESENC \TMP3, \XMM2 759 AESENC \TMP3, \XMM3 760 AESENC \TMP3, \XMM4 761 pxor \TMP1, \TMP4 762# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 763 pxor \XMM7, \XMM5 764 pxor \TMP2, \TMP6 765 766 # Multiply XMM8 * HashKey 767 # XMM8 and TMP5 hold the values for the two operands 768 769 movdqa \XMM8, \TMP1 770 pshufd $78, \XMM8, \TMP2 771 pxor \XMM8, \TMP2 772 movdqa HashKey(%rsp), \TMP5 773 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 774 movaps 0x90(%arg1), \TMP3 775 AESENC \TMP3, \XMM1 # Round 9 776 AESENC \TMP3, \XMM2 777 AESENC \TMP3, \XMM3 778 AESENC \TMP3, \XMM4 779 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 780 lea 0xa0(%arg1),%r10 781 mov keysize,%eax 782 shr $2,%eax # 128->4, 192->6, 256->8 783 sub $4,%eax # 128->0, 192->2, 256->4 784 jz aes_loop_par_enc_done 785 786aes_loop_par_enc: 787 MOVADQ (%r10),\TMP3 788.irpc index, 1234 789 AESENC \TMP3, %xmm\index 790.endr 791 add $16,%r10 792 sub $1,%eax 793 jnz aes_loop_par_enc 794 795aes_loop_par_enc_done: 796 MOVADQ (%r10), \TMP3 797 AESENCLAST \TMP3, \XMM1 # Round 10 798 AESENCLAST \TMP3, \XMM2 799 AESENCLAST \TMP3, \XMM3 800 AESENCLAST \TMP3, \XMM4 801 movdqa HashKey_k(%rsp), \TMP5 802 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 803 movdqu (%arg3,%r11,1), \TMP3 804 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 805 movdqu 16(%arg3,%r11,1), \TMP3 806 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 807 movdqu 32(%arg3,%r11,1), \TMP3 808 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 809 movdqu 48(%arg3,%r11,1), \TMP3 810 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 811 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer 812 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer 813 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer 814 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer 815 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 816 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 817 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 818 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 819 820 pxor \TMP4, \TMP1 821 pxor \XMM8, \XMM5 822 pxor \TMP6, \TMP2 823 pxor \TMP1, \TMP2 824 pxor \XMM5, \TMP2 825 movdqa \TMP2, \TMP3 826 pslldq $8, \TMP3 # left shift TMP3 2 DWs 827 psrldq $8, \TMP2 # right shift TMP2 2 DWs 828 pxor \TMP3, \XMM5 829 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 830 831 # first phase of reduction 832 833 movdqa \XMM5, \TMP2 834 movdqa \XMM5, \TMP3 835 movdqa \XMM5, \TMP4 836# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 837 pslld $31, \TMP2 # packed right shift << 31 838 pslld $30, \TMP3 # packed right shift << 30 839 pslld $25, \TMP4 # packed right shift << 25 840 pxor \TMP3, \TMP2 # xor the shifted versions 841 pxor \TMP4, \TMP2 842 movdqa \TMP2, \TMP5 843 psrldq $4, \TMP5 # right shift T5 1 DW 844 pslldq $12, \TMP2 # left shift T2 3 DWs 845 pxor \TMP2, \XMM5 846 847 # second phase of reduction 848 849 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 850 movdqa \XMM5,\TMP3 851 
movdqa \XMM5,\TMP4 852 psrld $1, \TMP2 # packed left shift >>1 853 psrld $2, \TMP3 # packed left shift >>2 854 psrld $7, \TMP4 # packed left shift >>7 855 pxor \TMP3,\TMP2 # xor the shifted versions 856 pxor \TMP4,\TMP2 857 pxor \TMP5, \TMP2 858 pxor \TMP2, \XMM5 859 pxor \TMP1, \XMM5 # result is in TMP1 860 861 pxor \XMM5, \XMM1 862.endm 863 864/* 865* decrypt 4 blocks at a time 866* ghash the 4 previously decrypted ciphertext blocks 867* arg1, %arg2, %arg3 are used as pointers only, not modified 868* %r11 is the data offset value 869*/ 870.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ 871TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 872 873 movdqa \XMM1, \XMM5 874 movdqa \XMM2, \XMM6 875 movdqa \XMM3, \XMM7 876 movdqa \XMM4, \XMM8 877 878 movdqa SHUF_MASK(%rip), %xmm15 879 # multiply TMP5 * HashKey using karatsuba 880 881 movdqa \XMM5, \TMP4 882 pshufd $78, \XMM5, \TMP6 883 pxor \XMM5, \TMP6 884 paddd ONE(%rip), \XMM0 # INCR CNT 885 movdqa HashKey_4(%rsp), \TMP5 886 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 887 movdqa \XMM0, \XMM1 888 paddd ONE(%rip), \XMM0 # INCR CNT 889 movdqa \XMM0, \XMM2 890 paddd ONE(%rip), \XMM0 # INCR CNT 891 movdqa \XMM0, \XMM3 892 paddd ONE(%rip), \XMM0 # INCR CNT 893 movdqa \XMM0, \XMM4 894 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 895 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 896 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 897 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 898 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 899 900 pxor (%arg1), \XMM1 901 pxor (%arg1), \XMM2 902 pxor (%arg1), \XMM3 903 pxor (%arg1), \XMM4 904 movdqa HashKey_4_k(%rsp), \TMP5 905 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 906 movaps 0x10(%arg1), \TMP1 907 AESENC \TMP1, \XMM1 # Round 1 908 AESENC \TMP1, \XMM2 909 AESENC \TMP1, \XMM3 910 AESENC \TMP1, \XMM4 911 movaps 0x20(%arg1), \TMP1 912 AESENC \TMP1, \XMM1 # Round 2 913 AESENC \TMP1, \XMM2 914 AESENC \TMP1, \XMM3 915 AESENC \TMP1, \XMM4 916 movdqa \XMM6, \TMP1 917 pshufd $78, \XMM6, \TMP2 918 pxor \XMM6, \TMP2 919 movdqa HashKey_3(%rsp), \TMP5 920 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 921 movaps 0x30(%arg1), \TMP3 922 AESENC \TMP3, \XMM1 # Round 3 923 AESENC \TMP3, \XMM2 924 AESENC \TMP3, \XMM3 925 AESENC \TMP3, \XMM4 926 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 927 movaps 0x40(%arg1), \TMP3 928 AESENC \TMP3, \XMM1 # Round 4 929 AESENC \TMP3, \XMM2 930 AESENC \TMP3, \XMM3 931 AESENC \TMP3, \XMM4 932 movdqa HashKey_3_k(%rsp), \TMP5 933 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 934 movaps 0x50(%arg1), \TMP3 935 AESENC \TMP3, \XMM1 # Round 5 936 AESENC \TMP3, \XMM2 937 AESENC \TMP3, \XMM3 938 AESENC \TMP3, \XMM4 939 pxor \TMP1, \TMP4 940# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 941 pxor \XMM6, \XMM5 942 pxor \TMP2, \TMP6 943 movdqa \XMM7, \TMP1 944 pshufd $78, \XMM7, \TMP2 945 pxor \XMM7, \TMP2 946 movdqa HashKey_2(%rsp ), \TMP5 947 948 # Multiply TMP5 * HashKey using karatsuba 949 950 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 951 movaps 0x60(%arg1), \TMP3 952 AESENC \TMP3, \XMM1 # Round 6 953 AESENC \TMP3, \XMM2 954 AESENC \TMP3, \XMM3 955 AESENC \TMP3, \XMM4 956 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 957 movaps 0x70(%arg1), \TMP3 958 AESENC \TMP3, \XMM1 # Round 7 959 AESENC \TMP3, \XMM2 960 AESENC \TMP3, \XMM3 961 AESENC \TMP3, \XMM4 962 movdqa HashKey_2_k(%rsp), \TMP5 963 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 964 movaps 0x80(%arg1), \TMP3 965 AESENC \TMP3, \XMM1 # Round 8 966 AESENC \TMP3, 
\XMM2 967 AESENC \TMP3, \XMM3 968 AESENC \TMP3, \XMM4 969 pxor \TMP1, \TMP4 970# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 971 pxor \XMM7, \XMM5 972 pxor \TMP2, \TMP6 973 974 # Multiply XMM8 * HashKey 975 # XMM8 and TMP5 hold the values for the two operands 976 977 movdqa \XMM8, \TMP1 978 pshufd $78, \XMM8, \TMP2 979 pxor \XMM8, \TMP2 980 movdqa HashKey(%rsp), \TMP5 981 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 982 movaps 0x90(%arg1), \TMP3 983 AESENC \TMP3, \XMM1 # Round 9 984 AESENC \TMP3, \XMM2 985 AESENC \TMP3, \XMM3 986 AESENC \TMP3, \XMM4 987 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 988 lea 0xa0(%arg1),%r10 989 mov keysize,%eax 990 shr $2,%eax # 128->4, 192->6, 256->8 991 sub $4,%eax # 128->0, 192->2, 256->4 992 jz aes_loop_par_dec_done 993 994aes_loop_par_dec: 995 MOVADQ (%r10),\TMP3 996.irpc index, 1234 997 AESENC \TMP3, %xmm\index 998.endr 999 add $16,%r10 1000 sub $1,%eax 1001 jnz aes_loop_par_dec 1002 1003aes_loop_par_dec_done: 1004 MOVADQ (%r10), \TMP3 1005 AESENCLAST \TMP3, \XMM1 # last round 1006 AESENCLAST \TMP3, \XMM2 1007 AESENCLAST \TMP3, \XMM3 1008 AESENCLAST \TMP3, \XMM4 1009 movdqa HashKey_k(%rsp), \TMP5 1010 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1011 movdqu (%arg3,%r11,1), \TMP3 1012 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1013 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer 1014 movdqa \TMP3, \XMM1 1015 movdqu 16(%arg3,%r11,1), \TMP3 1016 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1017 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer 1018 movdqa \TMP3, \XMM2 1019 movdqu 32(%arg3,%r11,1), \TMP3 1020 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1021 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer 1022 movdqa \TMP3, \XMM3 1023 movdqu 48(%arg3,%r11,1), \TMP3 1024 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1025 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer 1026 movdqa \TMP3, \XMM4 1027 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 1028 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 1029 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 1030 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 1031 1032 pxor \TMP4, \TMP1 1033 pxor \XMM8, \XMM5 1034 pxor \TMP6, \TMP2 1035 pxor \TMP1, \TMP2 1036 pxor \XMM5, \TMP2 1037 movdqa \TMP2, \TMP3 1038 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1039 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1040 pxor \TMP3, \XMM5 1041 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1042 1043 # first phase of reduction 1044 1045 movdqa \XMM5, \TMP2 1046 movdqa \XMM5, \TMP3 1047 movdqa \XMM5, \TMP4 1048# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1049 pslld $31, \TMP2 # packed right shift << 31 1050 pslld $30, \TMP3 # packed right shift << 30 1051 pslld $25, \TMP4 # packed right shift << 25 1052 pxor \TMP3, \TMP2 # xor the shifted versions 1053 pxor \TMP4, \TMP2 1054 movdqa \TMP2, \TMP5 1055 psrldq $4, \TMP5 # right shift T5 1 DW 1056 pslldq $12, \TMP2 # left shift T2 3 DWs 1057 pxor \TMP2, \XMM5 1058 1059 # second phase of reduction 1060 1061 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1062 movdqa \XMM5,\TMP3 1063 movdqa \XMM5,\TMP4 1064 psrld $1, \TMP2 # packed left shift >>1 1065 psrld $2, \TMP3 # packed left shift >>2 1066 psrld $7, \TMP4 # packed left shift >>7 1067 pxor \TMP3,\TMP2 # xor the shifted versions 1068 pxor \TMP4,\TMP2 1069 pxor \TMP5, \TMP2 1070 pxor \TMP2, \XMM5 1071 pxor \TMP1, \XMM5 # result is in TMP1 1072 1073 pxor \XMM5, \XMM1 1074.endm 1075 1076/* GHASH the last 4 
ciphertext blocks. */ 1077.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ 1078TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst 1079 1080 # Multiply TMP6 * HashKey (using Karatsuba) 1081 1082 movdqa \XMM1, \TMP6 1083 pshufd $78, \XMM1, \TMP2 1084 pxor \XMM1, \TMP2 1085 movdqa HashKey_4(%rsp), \TMP5 1086 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 1087 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 1088 movdqa HashKey_4_k(%rsp), \TMP4 1089 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1090 movdqa \XMM1, \XMMDst 1091 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 1092 1093 # Multiply TMP1 * HashKey (using Karatsuba) 1094 1095 movdqa \XMM2, \TMP1 1096 pshufd $78, \XMM2, \TMP2 1097 pxor \XMM2, \TMP2 1098 movdqa HashKey_3(%rsp), \TMP5 1099 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1100 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 1101 movdqa HashKey_3_k(%rsp), \TMP4 1102 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1103 pxor \TMP1, \TMP6 1104 pxor \XMM2, \XMMDst 1105 pxor \TMP2, \XMM1 1106# results accumulated in TMP6, XMMDst, XMM1 1107 1108 # Multiply TMP1 * HashKey (using Karatsuba) 1109 1110 movdqa \XMM3, \TMP1 1111 pshufd $78, \XMM3, \TMP2 1112 pxor \XMM3, \TMP2 1113 movdqa HashKey_2(%rsp), \TMP5 1114 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1115 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 1116 movdqa HashKey_2_k(%rsp), \TMP4 1117 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1118 pxor \TMP1, \TMP6 1119 pxor \XMM3, \XMMDst 1120 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 1121 1122 # Multiply TMP1 * HashKey (using Karatsuba) 1123 movdqa \XMM4, \TMP1 1124 pshufd $78, \XMM4, \TMP2 1125 pxor \XMM4, \TMP2 1126 movdqa HashKey(%rsp), \TMP5 1127 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1128 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 1129 movdqa HashKey_k(%rsp), \TMP4 1130 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1131 pxor \TMP1, \TMP6 1132 pxor \XMM4, \XMMDst 1133 pxor \XMM1, \TMP2 1134 pxor \TMP6, \TMP2 1135 pxor \XMMDst, \TMP2 1136 # middle section of the temp results combined as in karatsuba algorithm 1137 movdqa \TMP2, \TMP4 1138 pslldq $8, \TMP4 # left shift TMP4 2 DWs 1139 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1140 pxor \TMP4, \XMMDst 1141 pxor \TMP2, \TMP6 1142# TMP6:XMMDst holds the result of the accumulated carry-less multiplications 1143 # first phase of the reduction 1144 movdqa \XMMDst, \TMP2 1145 movdqa \XMMDst, \TMP3 1146 movdqa \XMMDst, \TMP4 1147# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently 1148 pslld $31, \TMP2 # packed right shifting << 31 1149 pslld $30, \TMP3 # packed right shifting << 30 1150 pslld $25, \TMP4 # packed right shifting << 25 1151 pxor \TMP3, \TMP2 # xor the shifted versions 1152 pxor \TMP4, \TMP2 1153 movdqa \TMP2, \TMP7 1154 psrldq $4, \TMP7 # right shift TMP7 1 DW 1155 pslldq $12, \TMP2 # left shift TMP2 3 DWs 1156 pxor \TMP2, \XMMDst 1157 1158 # second phase of the reduction 1159 movdqa \XMMDst, \TMP2 1160 # make 3 copies of XMMDst for doing 3 shift operations 1161 movdqa \XMMDst, \TMP3 1162 movdqa \XMMDst, \TMP4 1163 psrld $1, \TMP2 # packed left shift >> 1 1164 psrld $2, \TMP3 # packed left shift >> 2 1165 psrld $7, \TMP4 # packed left shift >> 7 1166 pxor \TMP3, \TMP2 # xor the shifted versions 1167 pxor \TMP4, \TMP2 1168 pxor \TMP7, \TMP2 1169 pxor \TMP2, \XMMDst 1170 pxor \TMP6, \XMMDst # reduced result is in XMMDst 1171.endm 1172 1173 1174/* Encryption of a single block 1175* uses eax & r10 1176*/ 1177 1178.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 1179 1180 pxor (%arg1), 
\XMM0 1181 mov keysize,%eax 1182 shr $2,%eax # 128->4, 192->6, 256->8 1183 add $5,%eax # 128->9, 192->11, 256->13 1184 lea 16(%arg1), %r10 # get first expanded key address 1185 1186_esb_loop_\@: 1187 MOVADQ (%r10),\TMP1 1188 AESENC \TMP1,\XMM0 1189 add $16,%r10 1190 sub $1,%eax 1191 jnz _esb_loop_\@ 1192 1193 MOVADQ (%r10),\TMP1 1194 AESENCLAST \TMP1,\XMM0 1195.endm 1196/***************************************************************************** 1197* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1198* u8 *out, // Plaintext output. Encrypt in-place is allowed. 1199* const u8 *in, // Ciphertext input 1200* u64 plaintext_len, // Length of data in bytes for decryption. 1201* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1202* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1203* // concatenated with 0x00000001. 16-byte aligned pointer. 1204* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1205* const u8 *aad, // Additional Authentication Data (AAD) 1206* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1207* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the 1208* // given authentication tag and only return the plaintext if they match. 1209* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 1210* // (most likely), 12 or 8. 1211* 1212* Assumptions: 1213* 1214* keys: 1215* keys are pre-expanded and aligned to 16 bytes. we are using the first 1216* set of 11 keys in the data structure void *aes_ctx 1217* 1218* iv: 1219* 0 1 2 3 1220* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1221* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1222* | Salt (From the SA) | 1223* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1224* | Initialization Vector | 1225* | (This is the sequence number from IPSec header) | 1226* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1227* | 0x1 | 1228* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1229* 1230* 1231* 1232* AAD: 1233* AAD padded to 128 bits with 0 1234* for example, assume AAD is a u32 vector 1235* 1236* if AAD is 8 bytes: 1237* AAD[3] = {A0, A1}; 1238* padded AAD in xmm register = {A1 A0 0 0} 1239* 1240* 0 1 2 3 1241* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1242* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1243* | SPI (A1) | 1244* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1245* | 32-bit Sequence Number (A0) | 1246* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1247* | 0x0 | 1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1249* 1250* AAD Format with 32-bit Sequence Number 1251* 1252* if AAD is 12 bytes: 1253* AAD[3] = {A0, A1, A2}; 1254* padded AAD in xmm register = {A2 A1 A0 0} 1255* 1256* 0 1 2 3 1257* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1258* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1259* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1260* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1261* | SPI (A2) | 1262* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1263* | 64-bit Extended Sequence Number {A1,A0} | 1264* | | 1265* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1266* | 0x0 | 1267* 
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1268* 1269* AAD Format with 64-bit Extended Sequence Number 1270* 1271* aadLen: 1272* from the definition of the spec, aadLen can only be 8 or 12 bytes. 1273* The code supports 16 too but for other sizes, the code will fail. 1274* 1275* TLen: 1276* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 1277* For other sizes, the code will fail. 1278* 1279* poly = x^128 + x^127 + x^126 + x^121 + 1 1280* 1281*****************************************************************************/ 1282ENTRY(aesni_gcm_dec) 1283 push %r12 1284 push %r13 1285 push %r14 1286 mov %rsp, %r14 1287/* 1288* states of %xmm registers %xmm6:%xmm15 not saved 1289* all %xmm registers are clobbered 1290*/ 1291 sub $VARIABLE_OFFSET, %rsp 1292 and $~63, %rsp # align rsp to 64 bytes 1293 mov %arg6, %r12 1294 movdqu (%r12), %xmm13 # %xmm13 = HashKey 1295 movdqa SHUF_MASK(%rip), %xmm2 1296 PSHUFB_XMM %xmm2, %xmm13 1297 1298 1299# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) 1300 1301 movdqa %xmm13, %xmm2 1302 psllq $1, %xmm13 1303 psrlq $63, %xmm2 1304 movdqa %xmm2, %xmm1 1305 pslldq $8, %xmm2 1306 psrldq $8, %xmm1 1307 por %xmm2, %xmm13 1308 1309 # Reduction 1310 1311 pshufd $0x24, %xmm1, %xmm2 1312 pcmpeqd TWOONE(%rip), %xmm2 1313 pand POLY(%rip), %xmm2 1314 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly) 1315 1316 1317 # Decrypt first few blocks 1318 1319 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) 1320 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext 1321 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) 1322 mov %r13, %r12 1323 and $(3<<4), %r12 1324 jz _initial_num_blocks_is_0_decrypt 1325 cmp $(2<<4), %r12 1326 jb _initial_num_blocks_is_1_decrypt 1327 je _initial_num_blocks_is_2_decrypt 1328_initial_num_blocks_is_3_decrypt: 1329 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1330%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec 1331 sub $48, %r13 1332 jmp _initial_blocks_decrypted 1333_initial_num_blocks_is_2_decrypt: 1334 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1335%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec 1336 sub $32, %r13 1337 jmp _initial_blocks_decrypted 1338_initial_num_blocks_is_1_decrypt: 1339 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1340%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec 1341 sub $16, %r13 1342 jmp _initial_blocks_decrypted 1343_initial_num_blocks_is_0_decrypt: 1344 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1345%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec 1346_initial_blocks_decrypted: 1347 cmp $0, %r13 1348 je _zero_cipher_left_decrypt 1349 sub $64, %r13 1350 je _four_cipher_left_decrypt 1351_decrypt_by_4: 1352 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1353%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec 1354 add $64, %r11 1355 sub $64, %r13 1356 jne _decrypt_by_4 1357_four_cipher_left_decrypt: 1358 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 1359%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 1360_zero_cipher_left_decrypt: 1361 mov %arg4, %r13 1362 and $15, %r13 # %r13 = arg4 (mod 16) 1363 je _multiple_of_16_bytes_decrypt 1364 1365 # Handle the last <16 byte block separately 1366 1367 paddd ONE(%rip), %xmm0 # increment CNT to get Yn 1368 movdqa SHUF_MASK(%rip), %xmm10 1369 PSHUFB_XMM %xmm10, %xmm0 1370 1371 
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) 1372 sub $16, %r11 1373 add %r13, %r11 1374 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block 1375 lea SHIFT_MASK+16(%rip), %r12 1376 sub %r13, %r12 1377# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes 1378# (%r13 is the number of bytes in plaintext mod 16) 1379 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 1380 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 butes 1381 1382 movdqa %xmm1, %xmm2 1383 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) 1384 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 1385 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 1386 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 1387 pand %xmm1, %xmm2 1388 movdqa SHUF_MASK(%rip), %xmm10 1389 PSHUFB_XMM %xmm10 ,%xmm2 1390 1391 pxor %xmm2, %xmm8 1392 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1393 # GHASH computation for the last <16 byte block 1394 sub %r13, %r11 1395 add $16, %r11 1396 1397 # output %r13 bytes 1398 MOVQ_R64_XMM %xmm0, %rax 1399 cmp $8, %r13 1400 jle _less_than_8_bytes_left_decrypt 1401 mov %rax, (%arg2 , %r11, 1) 1402 add $8, %r11 1403 psrldq $8, %xmm0 1404 MOVQ_R64_XMM %xmm0, %rax 1405 sub $8, %r13 1406_less_than_8_bytes_left_decrypt: 1407 mov %al, (%arg2, %r11, 1) 1408 add $1, %r11 1409 shr $8, %rax 1410 sub $1, %r13 1411 jne _less_than_8_bytes_left_decrypt 1412_multiple_of_16_bytes_decrypt: 1413 mov arg8, %r12 # %r13 = aadLen (number of bytes) 1414 shl $3, %r12 # convert into number of bits 1415 movd %r12d, %xmm15 # len(A) in %xmm15 1416 shl $3, %arg4 # len(C) in bits (*128) 1417 MOVQ_R64_XMM %arg4, %xmm1 1418 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1419 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1420 pxor %xmm15, %xmm8 1421 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1422 # final GHASH computation 1423 movdqa SHUF_MASK(%rip), %xmm10 1424 PSHUFB_XMM %xmm10, %xmm8 1425 1426 mov %arg5, %rax # %rax = *Y0 1427 movdqu (%rax), %xmm0 # %xmm0 = Y0 1428 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) 1429 pxor %xmm8, %xmm0 1430_return_T_decrypt: 1431 mov arg9, %r10 # %r10 = authTag 1432 mov arg10, %r11 # %r11 = auth_tag_len 1433 cmp $16, %r11 1434 je _T_16_decrypt 1435 cmp $12, %r11 1436 je _T_12_decrypt 1437_T_8_decrypt: 1438 MOVQ_R64_XMM %xmm0, %rax 1439 mov %rax, (%r10) 1440 jmp _return_T_done_decrypt 1441_T_12_decrypt: 1442 MOVQ_R64_XMM %xmm0, %rax 1443 mov %rax, (%r10) 1444 psrldq $8, %xmm0 1445 movd %xmm0, %eax 1446 mov %eax, 8(%r10) 1447 jmp _return_T_done_decrypt 1448_T_16_decrypt: 1449 movdqu %xmm0, (%r10) 1450_return_T_done_decrypt: 1451 mov %r14, %rsp 1452 pop %r14 1453 pop %r13 1454 pop %r12 1455 ret 1456ENDPROC(aesni_gcm_dec) 1457 1458 1459/***************************************************************************** 1460* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1461* u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1462* const u8 *in, // Plaintext input 1463* u64 plaintext_len, // Length of data in bytes for encryption. 1464* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1465* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1466* // concatenated with 0x00000001. 16-byte aligned pointer. 1467* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1468* const u8 *aad, // Additional Authentication Data (AAD) 1469* u64 aad_len, // Length of AAD in bytes. 
With RFC4106 this is going to be 8 or 12 bytes 1470* u8 *auth_tag, // Authenticated Tag output. 1471* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 1472* // 12 or 8. 1473* 1474* Assumptions: 1475* 1476* keys: 1477* keys are pre-expanded and aligned to 16 bytes. we are using the 1478* first set of 11 keys in the data structure void *aes_ctx 1479* 1480* 1481* iv: 1482* 0 1 2 3 1483* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1484* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1485* | Salt (From the SA) | 1486* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1487* | Initialization Vector | 1488* | (This is the sequence number from IPSec header) | 1489* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1490* | 0x1 | 1491* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1492* 1493* 1494* 1495* AAD: 1496* AAD padded to 128 bits with 0 1497* for example, assume AAD is a u32 vector 1498* 1499* if AAD is 8 bytes: 1500* AAD[3] = {A0, A1}; 1501* padded AAD in xmm register = {A1 A0 0 0} 1502* 1503* 0 1 2 3 1504* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1505* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1506* | SPI (A1) | 1507* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1508* | 32-bit Sequence Number (A0) | 1509* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1510* | 0x0 | 1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1512* 1513* AAD Format with 32-bit Sequence Number 1514* 1515* if AAD is 12 bytes: 1516* AAD[3] = {A0, A1, A2}; 1517* padded AAD in xmm register = {A2 A1 A0 0} 1518* 1519* 0 1 2 3 1520* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1521* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1522* | SPI (A2) | 1523* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1524* | 64-bit Extended Sequence Number {A1,A0} | 1525* | | 1526* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1527* | 0x0 | 1528* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1529* 1530* AAD Format with 64-bit Extended Sequence Number 1531* 1532* aadLen: 1533* from the definition of the spec, aadLen can only be 8 or 12 bytes. 1534* The code supports 16 too but for other sizes, the code will fail. 1535* 1536* TLen: 1537* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 1538* For other sizes, the code will fail. 
1539* 1540* poly = x^128 + x^127 + x^126 + x^121 + 1 1541***************************************************************************/ 1542ENTRY(aesni_gcm_enc) 1543 push %r12 1544 push %r13 1545 push %r14 1546 mov %rsp, %r14 1547# 1548# states of %xmm registers %xmm6:%xmm15 not saved 1549# all %xmm registers are clobbered 1550# 1551 sub $VARIABLE_OFFSET, %rsp 1552 and $~63, %rsp 1553 mov %arg6, %r12 1554 movdqu (%r12), %xmm13 1555 movdqa SHUF_MASK(%rip), %xmm2 1556 PSHUFB_XMM %xmm2, %xmm13 1557 1558 1559# precompute HashKey<<1 mod poly from the HashKey (required for GHASH) 1560 1561 movdqa %xmm13, %xmm2 1562 psllq $1, %xmm13 1563 psrlq $63, %xmm2 1564 movdqa %xmm2, %xmm1 1565 pslldq $8, %xmm2 1566 psrldq $8, %xmm1 1567 por %xmm2, %xmm13 1568 1569 # reduce HashKey<<1 1570 1571 pshufd $0x24, %xmm1, %xmm2 1572 pcmpeqd TWOONE(%rip), %xmm2 1573 pand POLY(%rip), %xmm2 1574 pxor %xmm2, %xmm13 1575 movdqa %xmm13, HashKey(%rsp) 1576 mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly) 1577 and $-16, %r13 1578 mov %r13, %r12 1579 1580 # Encrypt first few blocks 1581 1582 and $(3<<4), %r12 1583 jz _initial_num_blocks_is_0_encrypt 1584 cmp $(2<<4), %r12 1585 jb _initial_num_blocks_is_1_encrypt 1586 je _initial_num_blocks_is_2_encrypt 1587_initial_num_blocks_is_3_encrypt: 1588 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1589%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc 1590 sub $48, %r13 1591 jmp _initial_blocks_encrypted 1592_initial_num_blocks_is_2_encrypt: 1593 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1594%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc 1595 sub $32, %r13 1596 jmp _initial_blocks_encrypted 1597_initial_num_blocks_is_1_encrypt: 1598 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1599%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc 1600 sub $16, %r13 1601 jmp _initial_blocks_encrypted 1602_initial_num_blocks_is_0_encrypt: 1603 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1604%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc 1605_initial_blocks_encrypted: 1606 1607 # Main loop - Encrypt remaining blocks 1608 1609 cmp $0, %r13 1610 je _zero_cipher_left_encrypt 1611 sub $64, %r13 1612 je _four_cipher_left_encrypt 1613_encrypt_by_4_encrypt: 1614 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1615%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc 1616 add $64, %r11 1617 sub $64, %r13 1618 jne _encrypt_by_4_encrypt 1619_four_cipher_left_encrypt: 1620 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 1621%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 1622_zero_cipher_left_encrypt: 1623 mov %arg4, %r13 1624 and $15, %r13 # %r13 = arg4 (mod 16) 1625 je _multiple_of_16_bytes_encrypt 1626 1627 # Handle the last <16 Byte block separately 1628 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 1629 movdqa SHUF_MASK(%rip), %xmm10 1630 PSHUFB_XMM %xmm10, %xmm0 1631 1632 1633 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) 1634 sub $16, %r11 1635 add %r13, %r11 1636 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks 1637 lea SHIFT_MASK+16(%rip), %r12 1638 sub %r13, %r12 1639 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 1640 # (%r13 is the number of bytes in plaintext mod 16) 1641 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 1642 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte 1643 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) 1644 movdqu 
ALL_F-SHIFT_MASK(%r12), %xmm1 1645 # get the appropriate mask to mask out top 16-r13 bytes of xmm0 1646 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 1647 movdqa SHUF_MASK(%rip), %xmm10 1648 PSHUFB_XMM %xmm10,%xmm0 1649 1650 pxor %xmm0, %xmm8 1651 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1652 # GHASH computation for the last <16 byte block 1653 sub %r13, %r11 1654 add $16, %r11 1655 1656 movdqa SHUF_MASK(%rip), %xmm10 1657 PSHUFB_XMM %xmm10, %xmm0 1658 1659 # shuffle xmm0 back to output as ciphertext 1660 1661 # Output %r13 bytes 1662 MOVQ_R64_XMM %xmm0, %rax 1663 cmp $8, %r13 1664 jle _less_than_8_bytes_left_encrypt 1665 mov %rax, (%arg2 , %r11, 1) 1666 add $8, %r11 1667 psrldq $8, %xmm0 1668 MOVQ_R64_XMM %xmm0, %rax 1669 sub $8, %r13 1670_less_than_8_bytes_left_encrypt: 1671 mov %al, (%arg2, %r11, 1) 1672 add $1, %r11 1673 shr $8, %rax 1674 sub $1, %r13 1675 jne _less_than_8_bytes_left_encrypt 1676_multiple_of_16_bytes_encrypt: 1677 mov arg8, %r12 # %r12 = addLen (number of bytes) 1678 shl $3, %r12 1679 movd %r12d, %xmm15 # len(A) in %xmm15 1680 shl $3, %arg4 # len(C) in bits (*128) 1681 MOVQ_R64_XMM %arg4, %xmm1 1682 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1683 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1684 pxor %xmm15, %xmm8 1685 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1686 # final GHASH computation 1687 movdqa SHUF_MASK(%rip), %xmm10 1688 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap 1689 1690 mov %arg5, %rax # %rax = *Y0 1691 movdqu (%rax), %xmm0 # %xmm0 = Y0 1692 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) 1693 pxor %xmm8, %xmm0 1694_return_T_encrypt: 1695 mov arg9, %r10 # %r10 = authTag 1696 mov arg10, %r11 # %r11 = auth_tag_len 1697 cmp $16, %r11 1698 je _T_16_encrypt 1699 cmp $12, %r11 1700 je _T_12_encrypt 1701_T_8_encrypt: 1702 MOVQ_R64_XMM %xmm0, %rax 1703 mov %rax, (%r10) 1704 jmp _return_T_done_encrypt 1705_T_12_encrypt: 1706 MOVQ_R64_XMM %xmm0, %rax 1707 mov %rax, (%r10) 1708 psrldq $8, %xmm0 1709 movd %xmm0, %eax 1710 mov %eax, 8(%r10) 1711 jmp _return_T_done_encrypt 1712_T_16_encrypt: 1713 movdqu %xmm0, (%r10) 1714_return_T_done_encrypt: 1715 mov %r14, %rsp 1716 pop %r14 1717 pop %r13 1718 pop %r12 1719 ret 1720ENDPROC(aesni_gcm_enc) 1721 1722#endif 1723 1724 1725.align 4 1726_key_expansion_128: 1727_key_expansion_256a: 1728 pshufd $0b11111111, %xmm1, %xmm1 1729 shufps $0b00010000, %xmm0, %xmm4 1730 pxor %xmm4, %xmm0 1731 shufps $0b10001100, %xmm0, %xmm4 1732 pxor %xmm4, %xmm0 1733 pxor %xmm1, %xmm0 1734 movaps %xmm0, (TKEYP) 1735 add $0x10, TKEYP 1736 ret 1737ENDPROC(_key_expansion_128) 1738ENDPROC(_key_expansion_256a) 1739 1740.align 4 1741_key_expansion_192a: 1742 pshufd $0b01010101, %xmm1, %xmm1 1743 shufps $0b00010000, %xmm0, %xmm4 1744 pxor %xmm4, %xmm0 1745 shufps $0b10001100, %xmm0, %xmm4 1746 pxor %xmm4, %xmm0 1747 pxor %xmm1, %xmm0 1748 1749 movaps %xmm2, %xmm5 1750 movaps %xmm2, %xmm6 1751 pslldq $4, %xmm5 1752 pshufd $0b11111111, %xmm0, %xmm3 1753 pxor %xmm3, %xmm2 1754 pxor %xmm5, %xmm2 1755 1756 movaps %xmm0, %xmm1 1757 shufps $0b01000100, %xmm0, %xmm6 1758 movaps %xmm6, (TKEYP) 1759 shufps $0b01001110, %xmm2, %xmm1 1760 movaps %xmm1, 0x10(TKEYP) 1761 add $0x20, TKEYP 1762 ret 1763ENDPROC(_key_expansion_192a) 1764 1765.align 4 1766_key_expansion_192b: 1767 pshufd $0b01010101, %xmm1, %xmm1 1768 shufps $0b00010000, %xmm0, %xmm4 1769 pxor %xmm4, %xmm0 1770 shufps $0b10001100, %xmm0, %xmm4 1771 pxor %xmm4, %xmm0 1772 pxor %xmm1, %xmm0 1773 1774 movaps %xmm2, %xmm5 1775 pslldq $4, 

.align 4
_key_expansion_192a:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0

        movaps %xmm2, %xmm5
        movaps %xmm2, %xmm6
        pslldq $4, %xmm5
        pshufd $0b11111111, %xmm0, %xmm3
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm2

        movaps %xmm0, %xmm1
        shufps $0b01000100, %xmm0, %xmm6
        movaps %xmm6, (TKEYP)
        shufps $0b01001110, %xmm2, %xmm1
        movaps %xmm1, 0x10(TKEYP)
        add $0x20, TKEYP
        ret
ENDPROC(_key_expansion_192a)

.align 4
_key_expansion_192b:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0

        movaps %xmm2, %xmm5
        pslldq $4, %xmm5
        pshufd $0b11111111, %xmm0, %xmm3
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm2

        movaps %xmm0, (TKEYP)
        add $0x10, TKEYP
        ret
ENDPROC(_key_expansion_192b)

.align 4
_key_expansion_256b:
        pshufd $0b10101010, %xmm1, %xmm1
        shufps $0b00010000, %xmm2, %xmm4
        pxor %xmm4, %xmm2
        shufps $0b10001100, %xmm2, %xmm4
        pxor %xmm4, %xmm2
        pxor %xmm1, %xmm2
        movaps %xmm2, (TKEYP)
        add $0x10, TKEYP
        ret
ENDPROC(_key_expansion_256b)
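
/*
 * Added note (illustrative, not original code): in FIPS-197 form the AES-192
 * schedule expands six 32-bit words per step (Nk = 6, 52 words total):
 *
 *	for (i = 6; i < 52; i++) {
 *		u32 tmp = w[i - 1];
 *
 *		if (i % 6 == 0)
 *			tmp = SubWord(RotWord(tmp)) ^ Rcon[i / 6];
 *		w[i] = w[i - 6] ^ tmp;
 *	}
 *
 * Each _key_expansion_192a/b call above covers one such step; since six new
 * words are only 1.5 round keys, the "a" variant flushes 0x20 bytes and the
 * "b" variant 0x10 bytes, alternating as driven from aesni_set_key below.
 */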

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
ENTRY(aesni_set_key)
#ifndef __x86_64__
        pushl KEYP
        movl 8(%esp), KEYP              # ctx
        movl 12(%esp), UKEYP            # in_key
        movl 16(%esp), %edx             # key_len
#endif
        movups (UKEYP), %xmm0           # user key (first 16 bytes)
        movaps %xmm0, (KEYP)
        lea 0x10(KEYP), TKEYP           # key addr
        movl %edx, 480(KEYP)
        pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
        cmp $24, %dl
        jb .Lenc_key128
        je .Lenc_key192
        movups 0x10(UKEYP), %xmm2       # other user key
        movaps %xmm2, (TKEYP)
        add $0x10, TKEYP
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call _key_expansion_256a
        AESKEYGENASSIST 0x1 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call _key_expansion_256a
        AESKEYGENASSIST 0x2 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call _key_expansion_256a
        AESKEYGENASSIST 0x4 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call _key_expansion_256a
        AESKEYGENASSIST 0x8 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call _key_expansion_256a
        AESKEYGENASSIST 0x10 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call _key_expansion_256a
        AESKEYGENASSIST 0x20 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call _key_expansion_256a
        jmp .Ldec_key
.Lenc_key192:
        movq 0x10(UKEYP), %xmm2         # other user key
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call _key_expansion_192a
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call _key_expansion_192b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call _key_expansion_192a
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call _key_expansion_192b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call _key_expansion_192a
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call _key_expansion_192b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call _key_expansion_192a
        AESKEYGENASSIST 0x80 %xmm2 %xmm1        # round 8
        call _key_expansion_192b
        jmp .Ldec_key
.Lenc_key128:
        AESKEYGENASSIST 0x1 %xmm0 %xmm1         # round 1
        call _key_expansion_128
        AESKEYGENASSIST 0x2 %xmm0 %xmm1         # round 2
        call _key_expansion_128
        AESKEYGENASSIST 0x4 %xmm0 %xmm1         # round 3
        call _key_expansion_128
        AESKEYGENASSIST 0x8 %xmm0 %xmm1         # round 4
        call _key_expansion_128
        AESKEYGENASSIST 0x10 %xmm0 %xmm1        # round 5
        call _key_expansion_128
        AESKEYGENASSIST 0x20 %xmm0 %xmm1        # round 6
        call _key_expansion_128
        AESKEYGENASSIST 0x40 %xmm0 %xmm1        # round 7
        call _key_expansion_128
        AESKEYGENASSIST 0x80 %xmm0 %xmm1        # round 8
        call _key_expansion_128
        AESKEYGENASSIST 0x1b %xmm0 %xmm1        # round 9
        call _key_expansion_128
        AESKEYGENASSIST 0x36 %xmm0 %xmm1        # round 10
        call _key_expansion_128
.Ldec_key:
        sub $0x10, TKEYP
        movaps (KEYP), %xmm0
        movaps (TKEYP), %xmm1
        movaps %xmm0, 240(TKEYP)
        movaps %xmm1, 240(KEYP)
        add $0x10, KEYP
        lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
        movaps (KEYP), %xmm0
        AESIMC %xmm0 %xmm1
        movaps %xmm1, (UKEYP)
        add $0x10, KEYP
        sub $0x10, UKEYP
        cmp TKEYP, KEYP
        jb .Ldec_key_loop
        xor AREG, AREG
#ifndef __x86_64__
        popl KEYP
#endif
        ret
ENDPROC(aesni_set_key)
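
/*
 * Added note: the 240(...) and 480(...) offsets used above assume the usual
 * layout of struct crypto_aes_ctx (a sketch; see the kernel AES headers):
 *
 *	struct crypto_aes_ctx {
 *		u32 key_enc[60];
 *		u32 key_dec[60];
 *		u32 key_length;
 *	};
 *
 * key_enc starts at offset 0, key_dec at offset 240 and key_length at
 * offset 480.  .Ldec_key fills key_dec with the AESIMC-transformed
 * encryption round keys, stored in reverse order, as required by AESDEC.
 */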

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
#ifndef __x86_64__
        pushl KEYP
        pushl KLEN
        movl 12(%esp), KEYP
        movl 16(%esp), OUTP
        movl 20(%esp), INP
#endif
        movl 480(KEYP), KLEN            # key length
        movups (INP), STATE             # input
        call _aesni_enc1
        movups STATE, (OUTP)            # output
#ifndef __x86_64__
        popl KLEN
        popl KEYP
#endif
        ret
ENDPROC(aesni_enc)

/*
 * _aesni_enc1: internal ABI
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length (in bytes)
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
        pxor KEY, STATE                 # round 0
        add $0x30, TKEYP
        cmp $24, KLEN
        jb .Lenc128
        lea 0x20(TKEYP), TKEYP
        je .Lenc192
        add $0x20, TKEYP
        movaps -0x60(TKEYP), KEY
        AESENC KEY STATE
        movaps -0x50(TKEYP), KEY
        AESENC KEY STATE
.align 4
.Lenc192:
        movaps -0x40(TKEYP), KEY
        AESENC KEY STATE
        movaps -0x30(TKEYP), KEY
        AESENC KEY STATE
.align 4
.Lenc128:
        movaps -0x20(TKEYP), KEY
        AESENC KEY STATE
        movaps -0x10(TKEYP), KEY
        AESENC KEY STATE
        movaps (TKEYP), KEY
        AESENC KEY STATE
        movaps 0x10(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x20(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x30(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x40(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x50(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x60(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x70(TKEYP), KEY
        AESENCLAST KEY STATE
        ret
ENDPROC(_aesni_enc1)

/*
 * _aesni_enc4: internal ABI
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length (in bytes)
 *	STATE1:	initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:	final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
        pxor KEY, STATE1                # round 0
        pxor KEY, STATE2
        pxor KEY, STATE3
        pxor KEY, STATE4
        add $0x30, TKEYP
        cmp $24, KLEN
        jb .L4enc128
        lea 0x20(TKEYP), TKEYP
        je .L4enc192
        add $0x20, TKEYP
        movaps -0x60(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps -0x50(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
#.align 4
.L4enc192:
        movaps -0x40(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps -0x30(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
#.align 4
.L4enc128:
        movaps -0x20(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps -0x10(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps (TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x10(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x20(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x30(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x40(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x50(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x60(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x70(TKEYP), KEY
        AESENCLAST KEY STATE1           # last round
        AESENCLAST KEY STATE2
        AESENCLAST KEY STATE3
        AESENCLAST KEY STATE4
        ret
ENDPROC(_aesni_enc4)
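
/*
 * Added note (illustrative sketch, not original code): for a 128-bit key the
 * flow of _aesni_enc1 corresponds to the following intrinsics; the 192/256
 * paths above only run two or four extra AESENC rounds before falling
 * through to .Lenc128:
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i aes128_encrypt_block(const __m128i rk[11], __m128i st)
 *	{
 *		int i;
 *
 *		st = _mm_xor_si128(st, rk[0]);
 *		for (i = 1; i < 10; i++)
 *			st = _mm_aesenc_si128(st, rk[i]);
 *		return _mm_aesenclast_si128(st, rk[10]);
 *	}
 */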

/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
#ifndef __x86_64__
        pushl KEYP
        pushl KLEN
        movl 12(%esp), KEYP
        movl 16(%esp), OUTP
        movl 20(%esp), INP
#endif
        mov 480(KEYP), KLEN             # key length
        add $240, KEYP
        movups (INP), STATE             # input
        call _aesni_dec1
        movups STATE, (OUTP)            # output
#ifndef __x86_64__
        popl KLEN
        popl KEYP
#endif
        ret
ENDPROC(aesni_dec)

/*
 * _aesni_dec1: internal ABI
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length (in bytes)
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
        pxor KEY, STATE                 # round 0
        add $0x30, TKEYP
        cmp $24, KLEN
        jb .Ldec128
        lea 0x20(TKEYP), TKEYP
        je .Ldec192
        add $0x20, TKEYP
        movaps -0x60(TKEYP), KEY
        AESDEC KEY STATE
        movaps -0x50(TKEYP), KEY
        AESDEC KEY STATE
.align 4
.Ldec192:
        movaps -0x40(TKEYP), KEY
        AESDEC KEY STATE
        movaps -0x30(TKEYP), KEY
        AESDEC KEY STATE
.align 4
.Ldec128:
        movaps -0x20(TKEYP), KEY
        AESDEC KEY STATE
        movaps -0x10(TKEYP), KEY
        AESDEC KEY STATE
        movaps (TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x10(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x20(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x30(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x40(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x50(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x60(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x70(TKEYP), KEY
        AESDECLAST KEY STATE
        ret
ENDPROC(_aesni_dec1)

/*
 * _aesni_dec4: internal ABI
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length (in bytes)
 *	STATE1:	initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:	final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec4:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
        pxor KEY, STATE1                # round 0
        pxor KEY, STATE2
        pxor KEY, STATE3
        pxor KEY, STATE4
        add $0x30, TKEYP
        cmp $24, KLEN
        jb .L4dec128
        lea 0x20(TKEYP), TKEYP
        je .L4dec192
        add $0x20, TKEYP
        movaps -0x60(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps -0x50(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
.align 4
.L4dec192:
        movaps -0x40(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps -0x30(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
.align 4
.L4dec128:
        movaps -0x20(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps -0x10(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps (TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x10(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x20(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x30(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x40(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x50(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x60(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x70(TKEYP), KEY
        AESDECLAST KEY STATE1           # last round
        AESDECLAST KEY STATE2
        AESDECLAST KEY STATE3
        AESDECLAST KEY STATE4
        ret
ENDPROC(_aesni_dec4)
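
/*
 * Added note: AESDEC implements the Equivalent Inverse Cipher, so the round
 * keys consumed by _aesni_dec1/_aesni_dec4 are the ones prepared by
 * .Ldec_key in aesni_set_key.  In intrinsics terms (a sketch):
 *
 *	dec_rk[i] = _mm_aesimc_si128(enc_rk[Nr - i]);	(0 < i < Nr)
 *
 * with dec_rk[0] = enc_rk[Nr] and dec_rk[Nr] = enc_rk[0] used untransformed.
 */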

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
ENTRY(aesni_ecb_enc)
#ifndef __x86_64__
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl 16(%esp), KEYP
        movl 20(%esp), OUTP
        movl 24(%esp), INP
        movl 28(%esp), LEN
#endif
        test LEN, LEN                   # check length
        jz .Lecb_enc_ret
        mov 480(KEYP), KLEN
        cmp $16, LEN
        jb .Lecb_enc_ret
        cmp $64, LEN
        jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
        movups (INP), STATE1
        movups 0x10(INP), STATE2
        movups 0x20(INP), STATE3
        movups 0x30(INP), STATE4
        call _aesni_enc4
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jge .Lecb_enc_loop4
        cmp $16, LEN
        jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
        movups (INP), STATE1
        call _aesni_enc1
        movups STATE1, (OUTP)
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
#endif
        ret
ENDPROC(aesni_ecb_enc)

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl 16(%esp), KEYP
        movl 20(%esp), OUTP
        movl 24(%esp), INP
        movl 28(%esp), LEN
#endif
        test LEN, LEN
        jz .Lecb_dec_ret
        mov 480(KEYP), KLEN
        add $240, KEYP
        cmp $16, LEN
        jb .Lecb_dec_ret
        cmp $64, LEN
        jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
        movups (INP), STATE1
        movups 0x10(INP), STATE2
        movups 0x20(INP), STATE3
        movups 0x30(INP), STATE4
        call _aesni_dec4
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jge .Lecb_dec_loop4
        cmp $16, LEN
        jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
        movups (INP), STATE1
        call _aesni_dec1
        movups STATE1, (OUTP)
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
#endif
        ret
ENDPROC(aesni_ecb_dec)

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
        pushl IVP
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl 20(%esp), KEYP
        movl 24(%esp), OUTP
        movl 28(%esp), INP
        movl 32(%esp), LEN
        movl 36(%esp), IVP
#endif
        cmp $16, LEN
        jb .Lcbc_enc_ret
        mov 480(KEYP), KLEN
        movups (IVP), STATE             # load iv as initial state
.align 4
.Lcbc_enc_loop:
        movups (INP), IN                # load input
        pxor IN, STATE
        call _aesni_enc1
        movups STATE, (OUTP)            # store output
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lcbc_enc_loop
        movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
        popl IVP
#endif
        ret
ENDPROC(aesni_cbc_enc)
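
/*
 * Added note (illustrative sketch, not original code): the chaining done by
 * .Lcbc_enc_loop above, in C form for block-aligned lengths; the helper
 * names are hypothetical:
 *
 *	void cbc_encrypt(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 size_t len, u8 *iv)
 *	{
 *		u8 state[16];
 *
 *		memcpy(state, iv, 16);
 *		while (len >= 16) {
 *			xor_block(state, src);
 *			aes_encrypt_block(ctx, state);
 *			memcpy(dst, state, 16);
 *			src += 16; dst += 16; len -= 16;
 *		}
 *		memcpy(iv, state, 16);
 *	}
 */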

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
        pushl IVP
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl 20(%esp), KEYP
        movl 24(%esp), OUTP
        movl 28(%esp), INP
        movl 32(%esp), LEN
        movl 36(%esp), IVP
#endif
        cmp $16, LEN
        jb .Lcbc_dec_just_ret
        mov 480(KEYP), KLEN
        add $240, KEYP
        movups (IVP), IV
        cmp $64, LEN
        jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
        movups (INP), IN1
        movaps IN1, STATE1
        movups 0x10(INP), IN2
        movaps IN2, STATE2
#ifdef __x86_64__
        movups 0x20(INP), IN3
        movaps IN3, STATE3
        movups 0x30(INP), IN4
        movaps IN4, STATE4
#else
        movups 0x20(INP), IN1
        movaps IN1, STATE3
        movups 0x30(INP), IN2
        movaps IN2, STATE4
#endif
        call _aesni_dec4
        pxor IV, STATE1
#ifdef __x86_64__
        pxor IN1, STATE2
        pxor IN2, STATE3
        pxor IN3, STATE4
        movaps IN4, IV
#else
        pxor IN1, STATE4
        movaps IN2, IV
        movups (INP), IN1
        pxor IN1, STATE2
        movups 0x10(INP), IN2
        pxor IN2, STATE3
#endif
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jge .Lcbc_dec_loop4
        cmp $16, LEN
        jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
        movups (INP), IN
        movaps IN, STATE
        call _aesni_dec1
        pxor IV, STATE
        movups STATE, (OUTP)
        movaps IN, IV
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
        movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
        popl IVP
#endif
        ret
ENDPROC(aesni_cbc_dec)

#ifdef __x86_64__
.align 16
.Lbswap_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * _aesni_inc_init: internal ABI
 * setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
.align 4
_aesni_inc_init:
        movaps .Lbswap_mask, BSWAP_MASK
        movaps IV, CTR
        PSHUFB_XMM BSWAP_MASK CTR
        mov $1, TCTR_LOW
        MOVQ_R64_XMM TCTR_LOW INC
        MOVQ_R64_XMM CTR TCTR_LOW
        ret
ENDPROC(_aesni_inc_init)

/*
 * _aesni_inc: internal ABI
 * Increment IV by 1; IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	incremented by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
.align 4
_aesni_inc:
        paddq INC, CTR
        add $1, TCTR_LOW
        jnc .Linc_low
        pslldq $8, INC
        paddq INC, CTR
        psrldq $8, INC
.Linc_low:
        movaps CTR, IV
        PSHUFB_XMM BSWAP_MASK IV
        ret
ENDPROC(_aesni_inc)

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
        cmp $16, LEN
        jb .Lctr_enc_just_ret
        mov 480(KEYP), KLEN
        movups (IVP), IV
        call _aesni_inc_init
        cmp $64, LEN
        jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
        movaps IV, STATE1
        call _aesni_inc
        movups (INP), IN1
        movaps IV, STATE2
        call _aesni_inc
        movups 0x10(INP), IN2
        movaps IV, STATE3
        call _aesni_inc
        movups 0x20(INP), IN3
        movaps IV, STATE4
        call _aesni_inc
        movups 0x30(INP), IN4
        call _aesni_enc4
        pxor IN1, STATE1
        movups STATE1, (OUTP)
        pxor IN2, STATE2
        movups STATE2, 0x10(OUTP)
        pxor IN3, STATE3
        movups STATE3, 0x20(OUTP)
        pxor IN4, STATE4
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jge .Lctr_enc_loop4
        cmp $16, LEN
        jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
        movaps IV, STATE
        call _aesni_inc
        movups (INP), IN
        call _aesni_enc1
        pxor IN, STATE
        movups STATE, (OUTP)
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lctr_enc_loop1
.Lctr_enc_ret:
        movups IV, (IVP)
.Lctr_enc_just_ret:
        ret
ENDPROC(aesni_ctr_enc)

/*
 * _aesni_gf128mul_x_ble: internal ABI
 * Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 */
#define _aesni_gf128mul_x_ble() \
        pshufd $0x13, IV, CTR; \
        paddq IV, IV; \
        psrad $31, CTR; \
        pand GF128MUL_MASK, CTR; \
        pxor CTR, IV;
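
/*
 * Added note (illustrative sketch, not original code): the tweak update done
 * by the macro above, written in C.  The 16-byte block is treated as two
 * little-endian 64-bit halves; multiplying by x shifts the 128-bit value
 * left by one bit and, on carry out of bit 127, folds 0x87 back into the
 * low byte (kernel-style unaligned helpers assumed):
 *
 *	void gf128mul_x_ble(u8 b[16])
 *	{
 *		u64 lo = get_unaligned_le64(b);
 *		u64 hi = get_unaligned_le64(b + 8);
 *		u64 carry = hi >> 63;
 *
 *		hi = (hi << 1) | (lo >> 63);
 *		lo = (lo << 1) ^ (carry ? 0x87 : 0);
 *		put_unaligned_le64(lo, b);
 *		put_unaligned_le64(hi, b + 8);
 *	}
 *
 * The pshufd/psrad/pand sequence builds both the 0x87 fold and the low-to-
 * high carry (the 0x01 half of GF128MUL_MASK) as a mask, avoiding a branch.
 */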

/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                       bool enc, u8 *iv)
 */
ENTRY(aesni_xts_crypt8)
        cmpb $0, %cl
        movl $0, %ecx
        movl $240, %r10d
        leaq _aesni_enc4, %r11
        leaq _aesni_dec4, %rax
        cmovel %r10d, %ecx
        cmoveq %rax, %r11

        movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
        movups (IVP), IV

        mov 480(KEYP), KLEN
        addq %rcx, KEYP

        movdqa IV, STATE1
        movdqu 0x00(INP), INC
        pxor INC, STATE1
        movdqu IV, 0x00(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE2
        movdqu 0x10(INP), INC
        pxor INC, STATE2
        movdqu IV, 0x10(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE3
        movdqu 0x20(INP), INC
        pxor INC, STATE3
        movdqu IV, 0x20(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE4
        movdqu 0x30(INP), INC
        pxor INC, STATE4
        movdqu IV, 0x30(OUTP)

        call *%r11

        movdqu 0x00(OUTP), INC
        pxor INC, STATE1
        movdqu STATE1, 0x00(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE1
        movdqu 0x40(INP), INC
        pxor INC, STATE1
        movdqu IV, 0x40(OUTP)

        movdqu 0x10(OUTP), INC
        pxor INC, STATE2
        movdqu STATE2, 0x10(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE2
        movdqu 0x50(INP), INC
        pxor INC, STATE2
        movdqu IV, 0x50(OUTP)

        movdqu 0x20(OUTP), INC
        pxor INC, STATE3
        movdqu STATE3, 0x20(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE3
        movdqu 0x60(INP), INC
        pxor INC, STATE3
        movdqu IV, 0x60(OUTP)

        movdqu 0x30(OUTP), INC
        pxor INC, STATE4
        movdqu STATE4, 0x30(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE4
        movdqu 0x70(INP), INC
        pxor INC, STATE4
        movdqu IV, 0x70(OUTP)

        _aesni_gf128mul_x_ble()
        movups IV, (IVP)

        call *%r11

        movdqu 0x40(OUTP), INC
        pxor INC, STATE1
        movdqu STATE1, 0x40(OUTP)

        movdqu 0x50(OUTP), INC
        pxor INC, STATE2
        movdqu STATE2, 0x50(OUTP)

        movdqu 0x60(OUTP), INC
        pxor INC, STATE3
        movdqu STATE3, 0x60(OUTP)

        movdqu 0x70(OUTP), INC
        pxor INC, STATE4
        movdqu STATE4, 0x70(OUTP)

        ret
ENDPROC(aesni_xts_crypt8)

#endif