/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>

#ifdef __x86_64__
.data
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087

POLY:   .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F

SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
MASK1:      .octa 0x0000000000000000ffffffffffffffff
MASK2:      .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
ZERO:       .octa 0x00000000000000000000000000000000
ONE:        .octa 0x00000000000000000000000000000001
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
dec:        .octa 0x1
enc:        .octa 0x2


.text


#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif


#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input.
* GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = a1*b0 + a0*b1 (middle term)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

        # first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld     $31, \TMP2		# packed left shift <<31
	pslld     $30, \TMP3		# packed left shift <<30
	pslld     $25, \TMP4		# packed left shift <<25
	pxor      \TMP3, \TMP2		# xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5		# right shift TMP5 1 DW
	pslldq    $12, \TMP2		# left shift TMP2 3 DWs
	pxor      \TMP2, \GH

        # second phase of the reduction

	movdqa    \GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2		# packed right shift >>1
	psrld     $2,\TMP3		# packed right shift >>2
	psrld     $7,\TMP4		# packed right shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH		# result is in GH
.endm

/*
* if a = number of total plaintext bytes
*	b = floor(a/16)
*	num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
*/


.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
	movd	   (%r10), \TMP1
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation
	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation
	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
psrldq $4, %xmm\i 233 sub $4, %r12 234 cmp %r11, %r12 235 jne _get_AAD_loop2\num_initial_blocks\operation 236_get_AAD_loop2_done\num_initial_blocks\operation: 237 movdqa SHUF_MASK(%rip), %xmm14 238 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 239 240 xor %r11, %r11 # initialise the data pointer offset as zero 241 242 # start AES for num_initial_blocks blocks 243 244 mov %arg5, %rax # %rax = *Y0 245 movdqu (%rax), \XMM0 # XMM0 = Y0 246 movdqa SHUF_MASK(%rip), %xmm14 247 PSHUFB_XMM %xmm14, \XMM0 248 249.if (\i == 5) || (\i == 6) || (\i == 7) 250.irpc index, \i_seq 251 paddd ONE(%rip), \XMM0 # INCR Y0 252 movdqa \XMM0, %xmm\index 253 movdqa SHUF_MASK(%rip), %xmm14 254 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 255 256.endr 257.irpc index, \i_seq 258 pxor 16*0(%arg1), %xmm\index 259.endr 260.irpc index, \i_seq 261 movaps 0x10(%rdi), \TMP1 262 AESENC \TMP1, %xmm\index # Round 1 263.endr 264.irpc index, \i_seq 265 movaps 0x20(%arg1), \TMP1 266 AESENC \TMP1, %xmm\index # Round 2 267.endr 268.irpc index, \i_seq 269 movaps 0x30(%arg1), \TMP1 270 AESENC \TMP1, %xmm\index # Round 2 271.endr 272.irpc index, \i_seq 273 movaps 0x40(%arg1), \TMP1 274 AESENC \TMP1, %xmm\index # Round 2 275.endr 276.irpc index, \i_seq 277 movaps 0x50(%arg1), \TMP1 278 AESENC \TMP1, %xmm\index # Round 2 279.endr 280.irpc index, \i_seq 281 movaps 0x60(%arg1), \TMP1 282 AESENC \TMP1, %xmm\index # Round 2 283.endr 284.irpc index, \i_seq 285 movaps 0x70(%arg1), \TMP1 286 AESENC \TMP1, %xmm\index # Round 2 287.endr 288.irpc index, \i_seq 289 movaps 0x80(%arg1), \TMP1 290 AESENC \TMP1, %xmm\index # Round 2 291.endr 292.irpc index, \i_seq 293 movaps 0x90(%arg1), \TMP1 294 AESENC \TMP1, %xmm\index # Round 2 295.endr 296.irpc index, \i_seq 297 movaps 0xa0(%arg1), \TMP1 298 AESENCLAST \TMP1, %xmm\index # Round 10 299.endr 300.irpc index, \i_seq 301 movdqu (%arg3 , %r11, 1), \TMP1 302 pxor \TMP1, %xmm\index 303 movdqu %xmm\index, (%arg2 , %r11, 1) 304 # write back plaintext/ciphertext for num_initial_blocks 305 add $16, %r11 306 307 movdqa \TMP1, %xmm\index 308 movdqa SHUF_MASK(%rip), %xmm14 309 PSHUFB_XMM %xmm14, %xmm\index 310 311 # prepare plaintext/ciphertext for GHASH computation 312.endr 313.endif 314 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 315 # apply GHASH on num_initial_blocks blocks 316 317.if \i == 5 318 pxor %xmm5, %xmm6 319 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 320 pxor %xmm6, %xmm7 321 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 322 pxor %xmm7, %xmm8 323 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 324.elseif \i == 6 325 pxor %xmm6, %xmm7 326 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 327 pxor %xmm7, %xmm8 328 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 329.elseif \i == 7 330 pxor %xmm7, %xmm8 331 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 332.endif 333 cmp $64, %r13 334 jl _initial_blocks_done\num_initial_blocks\operation 335 # no need for precomputed values 336/* 337* 338* Precomputations for HashKey parallel with encryption of first 4 blocks. 
339* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 340*/ 341 paddd ONE(%rip), \XMM0 # INCR Y0 342 movdqa \XMM0, \XMM1 343 movdqa SHUF_MASK(%rip), %xmm14 344 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 345 346 paddd ONE(%rip), \XMM0 # INCR Y0 347 movdqa \XMM0, \XMM2 348 movdqa SHUF_MASK(%rip), %xmm14 349 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 350 351 paddd ONE(%rip), \XMM0 # INCR Y0 352 movdqa \XMM0, \XMM3 353 movdqa SHUF_MASK(%rip), %xmm14 354 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 355 356 paddd ONE(%rip), \XMM0 # INCR Y0 357 movdqa \XMM0, \XMM4 358 movdqa SHUF_MASK(%rip), %xmm14 359 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 360 361 pxor 16*0(%arg1), \XMM1 362 pxor 16*0(%arg1), \XMM2 363 pxor 16*0(%arg1), \XMM3 364 pxor 16*0(%arg1), \XMM4 365 movdqa \TMP3, \TMP5 366 pshufd $78, \TMP3, \TMP1 367 pxor \TMP3, \TMP1 368 movdqa \TMP1, HashKey_k(%rsp) 369 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 370# TMP5 = HashKey^2<<1 (mod poly) 371 movdqa \TMP5, HashKey_2(%rsp) 372# HashKey_2 = HashKey^2<<1 (mod poly) 373 pshufd $78, \TMP5, \TMP1 374 pxor \TMP5, \TMP1 375 movdqa \TMP1, HashKey_2_k(%rsp) 376.irpc index, 1234 # do 4 rounds 377 movaps 0x10*\index(%arg1), \TMP1 378 AESENC \TMP1, \XMM1 379 AESENC \TMP1, \XMM2 380 AESENC \TMP1, \XMM3 381 AESENC \TMP1, \XMM4 382.endr 383 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 384# TMP5 = HashKey^3<<1 (mod poly) 385 movdqa \TMP5, HashKey_3(%rsp) 386 pshufd $78, \TMP5, \TMP1 387 pxor \TMP5, \TMP1 388 movdqa \TMP1, HashKey_3_k(%rsp) 389.irpc index, 56789 # do next 5 rounds 390 movaps 0x10*\index(%arg1), \TMP1 391 AESENC \TMP1, \XMM1 392 AESENC \TMP1, \XMM2 393 AESENC \TMP1, \XMM3 394 AESENC \TMP1, \XMM4 395.endr 396 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 397# TMP5 = HashKey^3<<1 (mod poly) 398 movdqa \TMP5, HashKey_4(%rsp) 399 pshufd $78, \TMP5, \TMP1 400 pxor \TMP5, \TMP1 401 movdqa \TMP1, HashKey_4_k(%rsp) 402 movaps 0xa0(%arg1), \TMP2 403 AESENCLAST \TMP2, \XMM1 404 AESENCLAST \TMP2, \XMM2 405 AESENCLAST \TMP2, \XMM3 406 AESENCLAST \TMP2, \XMM4 407 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 408 pxor \TMP1, \XMM1 409 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 410 movdqa \TMP1, \XMM1 411 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 412 pxor \TMP1, \XMM2 413 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 414 movdqa \TMP1, \XMM2 415 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 416 pxor \TMP1, \XMM3 417 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 418 movdqa \TMP1, \XMM3 419 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 420 pxor \TMP1, \XMM4 421 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 422 movdqa \TMP1, \XMM4 423 add $64, %r11 424 movdqa SHUF_MASK(%rip), %xmm14 425 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 426 pxor \XMMDst, \XMM1 427# combine GHASHed value with the corresponding ciphertext 428 movdqa SHUF_MASK(%rip), %xmm14 429 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 430 movdqa SHUF_MASK(%rip), %xmm14 431 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 432 movdqa SHUF_MASK(%rip), %xmm14 433 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 434 435_initial_blocks_done\num_initial_blocks\operation: 436 437.endm 438 439 440/* 441* if a = number of total plaintext bytes 442* b = floor(a/16) 443* num_initial_blocks = b mod 4 444* encrypt the initial num_initial_blocks blocks and apply ghash on 445* the ciphertext 446* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 447* are clobbered 448* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified 449*/ 450 451 452.macro 
INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 453XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 454 mov arg7, %r10 # %r10 = AAD 455 mov arg8, %r12 # %r12 = aadLen 456 mov %r12, %r11 457 pxor %xmm\i, %xmm\i 458_get_AAD_loop\num_initial_blocks\operation: 459 movd (%r10), \TMP1 460 pslldq $12, \TMP1 461 psrldq $4, %xmm\i 462 pxor \TMP1, %xmm\i 463 add $4, %r10 464 sub $4, %r12 465 jne _get_AAD_loop\num_initial_blocks\operation 466 cmp $16, %r11 467 je _get_AAD_loop2_done\num_initial_blocks\operation 468 mov $16, %r12 469_get_AAD_loop2\num_initial_blocks\operation: 470 psrldq $4, %xmm\i 471 sub $4, %r12 472 cmp %r11, %r12 473 jne _get_AAD_loop2\num_initial_blocks\operation 474_get_AAD_loop2_done\num_initial_blocks\operation: 475 movdqa SHUF_MASK(%rip), %xmm14 476 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 477 478 xor %r11, %r11 # initialise the data pointer offset as zero 479 480 # start AES for num_initial_blocks blocks 481 482 mov %arg5, %rax # %rax = *Y0 483 movdqu (%rax), \XMM0 # XMM0 = Y0 484 movdqa SHUF_MASK(%rip), %xmm14 485 PSHUFB_XMM %xmm14, \XMM0 486 487.if (\i == 5) || (\i == 6) || (\i == 7) 488.irpc index, \i_seq 489 paddd ONE(%rip), \XMM0 # INCR Y0 490 movdqa \XMM0, %xmm\index 491 movdqa SHUF_MASK(%rip), %xmm14 492 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 493 494.endr 495.irpc index, \i_seq 496 pxor 16*0(%arg1), %xmm\index 497.endr 498.irpc index, \i_seq 499 movaps 0x10(%rdi), \TMP1 500 AESENC \TMP1, %xmm\index # Round 1 501.endr 502.irpc index, \i_seq 503 movaps 0x20(%arg1), \TMP1 504 AESENC \TMP1, %xmm\index # Round 2 505.endr 506.irpc index, \i_seq 507 movaps 0x30(%arg1), \TMP1 508 AESENC \TMP1, %xmm\index # Round 2 509.endr 510.irpc index, \i_seq 511 movaps 0x40(%arg1), \TMP1 512 AESENC \TMP1, %xmm\index # Round 2 513.endr 514.irpc index, \i_seq 515 movaps 0x50(%arg1), \TMP1 516 AESENC \TMP1, %xmm\index # Round 2 517.endr 518.irpc index, \i_seq 519 movaps 0x60(%arg1), \TMP1 520 AESENC \TMP1, %xmm\index # Round 2 521.endr 522.irpc index, \i_seq 523 movaps 0x70(%arg1), \TMP1 524 AESENC \TMP1, %xmm\index # Round 2 525.endr 526.irpc index, \i_seq 527 movaps 0x80(%arg1), \TMP1 528 AESENC \TMP1, %xmm\index # Round 2 529.endr 530.irpc index, \i_seq 531 movaps 0x90(%arg1), \TMP1 532 AESENC \TMP1, %xmm\index # Round 2 533.endr 534.irpc index, \i_seq 535 movaps 0xa0(%arg1), \TMP1 536 AESENCLAST \TMP1, %xmm\index # Round 10 537.endr 538.irpc index, \i_seq 539 movdqu (%arg3 , %r11, 1), \TMP1 540 pxor \TMP1, %xmm\index 541 movdqu %xmm\index, (%arg2 , %r11, 1) 542 # write back plaintext/ciphertext for num_initial_blocks 543 add $16, %r11 544 545 movdqa SHUF_MASK(%rip), %xmm14 546 PSHUFB_XMM %xmm14, %xmm\index 547 548 # prepare plaintext/ciphertext for GHASH computation 549.endr 550.endif 551 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 552 # apply GHASH on num_initial_blocks blocks 553 554.if \i == 5 555 pxor %xmm5, %xmm6 556 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 557 pxor %xmm6, %xmm7 558 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 559 pxor %xmm7, %xmm8 560 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 561.elseif \i == 6 562 pxor %xmm6, %xmm7 563 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 564 pxor %xmm7, %xmm8 565 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 566.elseif \i == 7 567 pxor %xmm7, %xmm8 568 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 569.endif 570 cmp $64, %r13 571 jl _initial_blocks_done\num_initial_blocks\operation 572 # 
no need for precomputed values 573/* 574* 575* Precomputations for HashKey parallel with encryption of first 4 blocks. 576* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 577*/ 578 paddd ONE(%rip), \XMM0 # INCR Y0 579 movdqa \XMM0, \XMM1 580 movdqa SHUF_MASK(%rip), %xmm14 581 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 582 583 paddd ONE(%rip), \XMM0 # INCR Y0 584 movdqa \XMM0, \XMM2 585 movdqa SHUF_MASK(%rip), %xmm14 586 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 587 588 paddd ONE(%rip), \XMM0 # INCR Y0 589 movdqa \XMM0, \XMM3 590 movdqa SHUF_MASK(%rip), %xmm14 591 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 592 593 paddd ONE(%rip), \XMM0 # INCR Y0 594 movdqa \XMM0, \XMM4 595 movdqa SHUF_MASK(%rip), %xmm14 596 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 597 598 pxor 16*0(%arg1), \XMM1 599 pxor 16*0(%arg1), \XMM2 600 pxor 16*0(%arg1), \XMM3 601 pxor 16*0(%arg1), \XMM4 602 movdqa \TMP3, \TMP5 603 pshufd $78, \TMP3, \TMP1 604 pxor \TMP3, \TMP1 605 movdqa \TMP1, HashKey_k(%rsp) 606 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 607# TMP5 = HashKey^2<<1 (mod poly) 608 movdqa \TMP5, HashKey_2(%rsp) 609# HashKey_2 = HashKey^2<<1 (mod poly) 610 pshufd $78, \TMP5, \TMP1 611 pxor \TMP5, \TMP1 612 movdqa \TMP1, HashKey_2_k(%rsp) 613.irpc index, 1234 # do 4 rounds 614 movaps 0x10*\index(%arg1), \TMP1 615 AESENC \TMP1, \XMM1 616 AESENC \TMP1, \XMM2 617 AESENC \TMP1, \XMM3 618 AESENC \TMP1, \XMM4 619.endr 620 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 621# TMP5 = HashKey^3<<1 (mod poly) 622 movdqa \TMP5, HashKey_3(%rsp) 623 pshufd $78, \TMP5, \TMP1 624 pxor \TMP5, \TMP1 625 movdqa \TMP1, HashKey_3_k(%rsp) 626.irpc index, 56789 # do next 5 rounds 627 movaps 0x10*\index(%arg1), \TMP1 628 AESENC \TMP1, \XMM1 629 AESENC \TMP1, \XMM2 630 AESENC \TMP1, \XMM3 631 AESENC \TMP1, \XMM4 632.endr 633 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 634# TMP5 = HashKey^3<<1 (mod poly) 635 movdqa \TMP5, HashKey_4(%rsp) 636 pshufd $78, \TMP5, \TMP1 637 pxor \TMP5, \TMP1 638 movdqa \TMP1, HashKey_4_k(%rsp) 639 movaps 0xa0(%arg1), \TMP2 640 AESENCLAST \TMP2, \XMM1 641 AESENCLAST \TMP2, \XMM2 642 AESENCLAST \TMP2, \XMM3 643 AESENCLAST \TMP2, \XMM4 644 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 645 pxor \TMP1, \XMM1 646 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 647 pxor \TMP1, \XMM2 648 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 649 pxor \TMP1, \XMM3 650 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 651 pxor \TMP1, \XMM4 652 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 653 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 654 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 655 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 656 657 add $64, %r11 658 movdqa SHUF_MASK(%rip), %xmm14 659 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 660 pxor \XMMDst, \XMM1 661# combine GHASHed value with the corresponding ciphertext 662 movdqa SHUF_MASK(%rip), %xmm14 663 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 664 movdqa SHUF_MASK(%rip), %xmm14 665 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 666 movdqa SHUF_MASK(%rip), %xmm14 667 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 668 669_initial_blocks_done\num_initial_blocks\operation: 670 671.endm 672 673/* 674* encrypt 4 blocks at a time 675* ghash the 4 previously encrypted ciphertext blocks 676* arg1, %arg2, %arg3 are used as pointers only, not modified 677* %r11 is the data offset value 678*/ 679.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ 680TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 681 682 movdqa \XMM1, \XMM5 683 
movdqa \XMM2, \XMM6 684 movdqa \XMM3, \XMM7 685 movdqa \XMM4, \XMM8 686 687 movdqa SHUF_MASK(%rip), %xmm15 688 # multiply TMP5 * HashKey using karatsuba 689 690 movdqa \XMM5, \TMP4 691 pshufd $78, \XMM5, \TMP6 692 pxor \XMM5, \TMP6 693 paddd ONE(%rip), \XMM0 # INCR CNT 694 movdqa HashKey_4(%rsp), \TMP5 695 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 696 movdqa \XMM0, \XMM1 697 paddd ONE(%rip), \XMM0 # INCR CNT 698 movdqa \XMM0, \XMM2 699 paddd ONE(%rip), \XMM0 # INCR CNT 700 movdqa \XMM0, \XMM3 701 paddd ONE(%rip), \XMM0 # INCR CNT 702 movdqa \XMM0, \XMM4 703 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 704 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 705 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 706 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 707 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 708 709 pxor (%arg1), \XMM1 710 pxor (%arg1), \XMM2 711 pxor (%arg1), \XMM3 712 pxor (%arg1), \XMM4 713 movdqa HashKey_4_k(%rsp), \TMP5 714 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 715 movaps 0x10(%arg1), \TMP1 716 AESENC \TMP1, \XMM1 # Round 1 717 AESENC \TMP1, \XMM2 718 AESENC \TMP1, \XMM3 719 AESENC \TMP1, \XMM4 720 movaps 0x20(%arg1), \TMP1 721 AESENC \TMP1, \XMM1 # Round 2 722 AESENC \TMP1, \XMM2 723 AESENC \TMP1, \XMM3 724 AESENC \TMP1, \XMM4 725 movdqa \XMM6, \TMP1 726 pshufd $78, \XMM6, \TMP2 727 pxor \XMM6, \TMP2 728 movdqa HashKey_3(%rsp), \TMP5 729 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 730 movaps 0x30(%arg1), \TMP3 731 AESENC \TMP3, \XMM1 # Round 3 732 AESENC \TMP3, \XMM2 733 AESENC \TMP3, \XMM3 734 AESENC \TMP3, \XMM4 735 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 736 movaps 0x40(%arg1), \TMP3 737 AESENC \TMP3, \XMM1 # Round 4 738 AESENC \TMP3, \XMM2 739 AESENC \TMP3, \XMM3 740 AESENC \TMP3, \XMM4 741 movdqa HashKey_3_k(%rsp), \TMP5 742 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 743 movaps 0x50(%arg1), \TMP3 744 AESENC \TMP3, \XMM1 # Round 5 745 AESENC \TMP3, \XMM2 746 AESENC \TMP3, \XMM3 747 AESENC \TMP3, \XMM4 748 pxor \TMP1, \TMP4 749# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 750 pxor \XMM6, \XMM5 751 pxor \TMP2, \TMP6 752 movdqa \XMM7, \TMP1 753 pshufd $78, \XMM7, \TMP2 754 pxor \XMM7, \TMP2 755 movdqa HashKey_2(%rsp ), \TMP5 756 757 # Multiply TMP5 * HashKey using karatsuba 758 759 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 760 movaps 0x60(%arg1), \TMP3 761 AESENC \TMP3, \XMM1 # Round 6 762 AESENC \TMP3, \XMM2 763 AESENC \TMP3, \XMM3 764 AESENC \TMP3, \XMM4 765 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 766 movaps 0x70(%arg1), \TMP3 767 AESENC \TMP3, \XMM1 # Round 7 768 AESENC \TMP3, \XMM2 769 AESENC \TMP3, \XMM3 770 AESENC \TMP3, \XMM4 771 movdqa HashKey_2_k(%rsp), \TMP5 772 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 773 movaps 0x80(%arg1), \TMP3 774 AESENC \TMP3, \XMM1 # Round 8 775 AESENC \TMP3, \XMM2 776 AESENC \TMP3, \XMM3 777 AESENC \TMP3, \XMM4 778 pxor \TMP1, \TMP4 779# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 780 pxor \XMM7, \XMM5 781 pxor \TMP2, \TMP6 782 783 # Multiply XMM8 * HashKey 784 # XMM8 and TMP5 hold the values for the two operands 785 786 movdqa \XMM8, \TMP1 787 pshufd $78, \XMM8, \TMP2 788 pxor \XMM8, \TMP2 789 movdqa HashKey(%rsp), \TMP5 790 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 791 movaps 0x90(%arg1), \TMP3 792 AESENC \TMP3, \XMM1 # Round 9 793 AESENC \TMP3, \XMM2 794 AESENC \TMP3, \XMM3 795 AESENC \TMP3, \XMM4 796 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 797 movaps 0xa0(%arg1), \TMP3 798 AESENCLAST \TMP3, \XMM1 # Round 10 799 AESENCLAST 
\TMP3, \XMM2 800 AESENCLAST \TMP3, \XMM3 801 AESENCLAST \TMP3, \XMM4 802 movdqa HashKey_k(%rsp), \TMP5 803 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 804 movdqu (%arg3,%r11,1), \TMP3 805 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 806 movdqu 16(%arg3,%r11,1), \TMP3 807 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 808 movdqu 32(%arg3,%r11,1), \TMP3 809 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 810 movdqu 48(%arg3,%r11,1), \TMP3 811 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 812 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer 813 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer 814 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer 815 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer 816 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 817 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 818 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 819 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 820 821 pxor \TMP4, \TMP1 822 pxor \XMM8, \XMM5 823 pxor \TMP6, \TMP2 824 pxor \TMP1, \TMP2 825 pxor \XMM5, \TMP2 826 movdqa \TMP2, \TMP3 827 pslldq $8, \TMP3 # left shift TMP3 2 DWs 828 psrldq $8, \TMP2 # right shift TMP2 2 DWs 829 pxor \TMP3, \XMM5 830 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 831 832 # first phase of reduction 833 834 movdqa \XMM5, \TMP2 835 movdqa \XMM5, \TMP3 836 movdqa \XMM5, \TMP4 837# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 838 pslld $31, \TMP2 # packed right shift << 31 839 pslld $30, \TMP3 # packed right shift << 30 840 pslld $25, \TMP4 # packed right shift << 25 841 pxor \TMP3, \TMP2 # xor the shifted versions 842 pxor \TMP4, \TMP2 843 movdqa \TMP2, \TMP5 844 psrldq $4, \TMP5 # right shift T5 1 DW 845 pslldq $12, \TMP2 # left shift T2 3 DWs 846 pxor \TMP2, \XMM5 847 848 # second phase of reduction 849 850 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 851 movdqa \XMM5,\TMP3 852 movdqa \XMM5,\TMP4 853 psrld $1, \TMP2 # packed left shift >>1 854 psrld $2, \TMP3 # packed left shift >>2 855 psrld $7, \TMP4 # packed left shift >>7 856 pxor \TMP3,\TMP2 # xor the shifted versions 857 pxor \TMP4,\TMP2 858 pxor \TMP5, \TMP2 859 pxor \TMP2, \XMM5 860 pxor \TMP1, \XMM5 # result is in TMP1 861 862 pxor \XMM5, \XMM1 863.endm 864 865/* 866* decrypt 4 blocks at a time 867* ghash the 4 previously decrypted ciphertext blocks 868* arg1, %arg2, %arg3 are used as pointers only, not modified 869* %r11 is the data offset value 870*/ 871.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ 872TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 873 874 movdqa \XMM1, \XMM5 875 movdqa \XMM2, \XMM6 876 movdqa \XMM3, \XMM7 877 movdqa \XMM4, \XMM8 878 879 movdqa SHUF_MASK(%rip), %xmm15 880 # multiply TMP5 * HashKey using karatsuba 881 882 movdqa \XMM5, \TMP4 883 pshufd $78, \XMM5, \TMP6 884 pxor \XMM5, \TMP6 885 paddd ONE(%rip), \XMM0 # INCR CNT 886 movdqa HashKey_4(%rsp), \TMP5 887 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 888 movdqa \XMM0, \XMM1 889 paddd ONE(%rip), \XMM0 # INCR CNT 890 movdqa \XMM0, \XMM2 891 paddd ONE(%rip), \XMM0 # INCR CNT 892 movdqa \XMM0, \XMM3 893 paddd ONE(%rip), \XMM0 # INCR CNT 894 movdqa \XMM0, \XMM4 895 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 896 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 897 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 898 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 899 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 900 901 pxor (%arg1), \XMM1 902 pxor 
(%arg1), \XMM2 903 pxor (%arg1), \XMM3 904 pxor (%arg1), \XMM4 905 movdqa HashKey_4_k(%rsp), \TMP5 906 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 907 movaps 0x10(%arg1), \TMP1 908 AESENC \TMP1, \XMM1 # Round 1 909 AESENC \TMP1, \XMM2 910 AESENC \TMP1, \XMM3 911 AESENC \TMP1, \XMM4 912 movaps 0x20(%arg1), \TMP1 913 AESENC \TMP1, \XMM1 # Round 2 914 AESENC \TMP1, \XMM2 915 AESENC \TMP1, \XMM3 916 AESENC \TMP1, \XMM4 917 movdqa \XMM6, \TMP1 918 pshufd $78, \XMM6, \TMP2 919 pxor \XMM6, \TMP2 920 movdqa HashKey_3(%rsp), \TMP5 921 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 922 movaps 0x30(%arg1), \TMP3 923 AESENC \TMP3, \XMM1 # Round 3 924 AESENC \TMP3, \XMM2 925 AESENC \TMP3, \XMM3 926 AESENC \TMP3, \XMM4 927 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 928 movaps 0x40(%arg1), \TMP3 929 AESENC \TMP3, \XMM1 # Round 4 930 AESENC \TMP3, \XMM2 931 AESENC \TMP3, \XMM3 932 AESENC \TMP3, \XMM4 933 movdqa HashKey_3_k(%rsp), \TMP5 934 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 935 movaps 0x50(%arg1), \TMP3 936 AESENC \TMP3, \XMM1 # Round 5 937 AESENC \TMP3, \XMM2 938 AESENC \TMP3, \XMM3 939 AESENC \TMP3, \XMM4 940 pxor \TMP1, \TMP4 941# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 942 pxor \XMM6, \XMM5 943 pxor \TMP2, \TMP6 944 movdqa \XMM7, \TMP1 945 pshufd $78, \XMM7, \TMP2 946 pxor \XMM7, \TMP2 947 movdqa HashKey_2(%rsp ), \TMP5 948 949 # Multiply TMP5 * HashKey using karatsuba 950 951 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 952 movaps 0x60(%arg1), \TMP3 953 AESENC \TMP3, \XMM1 # Round 6 954 AESENC \TMP3, \XMM2 955 AESENC \TMP3, \XMM3 956 AESENC \TMP3, \XMM4 957 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 958 movaps 0x70(%arg1), \TMP3 959 AESENC \TMP3, \XMM1 # Round 7 960 AESENC \TMP3, \XMM2 961 AESENC \TMP3, \XMM3 962 AESENC \TMP3, \XMM4 963 movdqa HashKey_2_k(%rsp), \TMP5 964 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 965 movaps 0x80(%arg1), \TMP3 966 AESENC \TMP3, \XMM1 # Round 8 967 AESENC \TMP3, \XMM2 968 AESENC \TMP3, \XMM3 969 AESENC \TMP3, \XMM4 970 pxor \TMP1, \TMP4 971# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 972 pxor \XMM7, \XMM5 973 pxor \TMP2, \TMP6 974 975 # Multiply XMM8 * HashKey 976 # XMM8 and TMP5 hold the values for the two operands 977 978 movdqa \XMM8, \TMP1 979 pshufd $78, \XMM8, \TMP2 980 pxor \XMM8, \TMP2 981 movdqa HashKey(%rsp), \TMP5 982 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 983 movaps 0x90(%arg1), \TMP3 984 AESENC \TMP3, \XMM1 # Round 9 985 AESENC \TMP3, \XMM2 986 AESENC \TMP3, \XMM3 987 AESENC \TMP3, \XMM4 988 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 989 movaps 0xa0(%arg1), \TMP3 990 AESENCLAST \TMP3, \XMM1 # Round 10 991 AESENCLAST \TMP3, \XMM2 992 AESENCLAST \TMP3, \XMM3 993 AESENCLAST \TMP3, \XMM4 994 movdqa HashKey_k(%rsp), \TMP5 995 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 996 movdqu (%arg3,%r11,1), \TMP3 997 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 998 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer 999 movdqa \TMP3, \XMM1 1000 movdqu 16(%arg3,%r11,1), \TMP3 1001 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1002 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer 1003 movdqa \TMP3, \XMM2 1004 movdqu 32(%arg3,%r11,1), \TMP3 1005 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1006 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer 1007 movdqa \TMP3, \XMM3 1008 movdqu 48(%arg3,%r11,1), \TMP3 1009 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1010 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer 1011 movdqa \TMP3, \XMM4 
1012 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 1013 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 1014 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 1015 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 1016 1017 pxor \TMP4, \TMP1 1018 pxor \XMM8, \XMM5 1019 pxor \TMP6, \TMP2 1020 pxor \TMP1, \TMP2 1021 pxor \XMM5, \TMP2 1022 movdqa \TMP2, \TMP3 1023 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1024 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1025 pxor \TMP3, \XMM5 1026 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1027 1028 # first phase of reduction 1029 1030 movdqa \XMM5, \TMP2 1031 movdqa \XMM5, \TMP3 1032 movdqa \XMM5, \TMP4 1033# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1034 pslld $31, \TMP2 # packed right shift << 31 1035 pslld $30, \TMP3 # packed right shift << 30 1036 pslld $25, \TMP4 # packed right shift << 25 1037 pxor \TMP3, \TMP2 # xor the shifted versions 1038 pxor \TMP4, \TMP2 1039 movdqa \TMP2, \TMP5 1040 psrldq $4, \TMP5 # right shift T5 1 DW 1041 pslldq $12, \TMP2 # left shift T2 3 DWs 1042 pxor \TMP2, \XMM5 1043 1044 # second phase of reduction 1045 1046 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1047 movdqa \XMM5,\TMP3 1048 movdqa \XMM5,\TMP4 1049 psrld $1, \TMP2 # packed left shift >>1 1050 psrld $2, \TMP3 # packed left shift >>2 1051 psrld $7, \TMP4 # packed left shift >>7 1052 pxor \TMP3,\TMP2 # xor the shifted versions 1053 pxor \TMP4,\TMP2 1054 pxor \TMP5, \TMP2 1055 pxor \TMP2, \XMM5 1056 pxor \TMP1, \XMM5 # result is in TMP1 1057 1058 pxor \XMM5, \XMM1 1059.endm 1060 1061/* GHASH the last 4 ciphertext blocks. */ 1062.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ 1063TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst 1064 1065 # Multiply TMP6 * HashKey (using Karatsuba) 1066 1067 movdqa \XMM1, \TMP6 1068 pshufd $78, \XMM1, \TMP2 1069 pxor \XMM1, \TMP2 1070 movdqa HashKey_4(%rsp), \TMP5 1071 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 1072 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 1073 movdqa HashKey_4_k(%rsp), \TMP4 1074 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1075 movdqa \XMM1, \XMMDst 1076 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 1077 1078 # Multiply TMP1 * HashKey (using Karatsuba) 1079 1080 movdqa \XMM2, \TMP1 1081 pshufd $78, \XMM2, \TMP2 1082 pxor \XMM2, \TMP2 1083 movdqa HashKey_3(%rsp), \TMP5 1084 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1085 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 1086 movdqa HashKey_3_k(%rsp), \TMP4 1087 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1088 pxor \TMP1, \TMP6 1089 pxor \XMM2, \XMMDst 1090 pxor \TMP2, \XMM1 1091# results accumulated in TMP6, XMMDst, XMM1 1092 1093 # Multiply TMP1 * HashKey (using Karatsuba) 1094 1095 movdqa \XMM3, \TMP1 1096 pshufd $78, \XMM3, \TMP2 1097 pxor \XMM3, \TMP2 1098 movdqa HashKey_2(%rsp), \TMP5 1099 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1100 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 1101 movdqa HashKey_2_k(%rsp), \TMP4 1102 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1103 pxor \TMP1, \TMP6 1104 pxor \XMM3, \XMMDst 1105 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 1106 1107 # Multiply TMP1 * HashKey (using Karatsuba) 1108 movdqa \XMM4, \TMP1 1109 pshufd $78, \XMM4, \TMP2 1110 pxor \XMM4, \TMP2 1111 movdqa HashKey(%rsp), \TMP5 1112 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1113 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 1114 movdqa HashKey_k(%rsp), \TMP4 1115 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1116 pxor \TMP1, \TMP6 1117 pxor \XMM4, \XMMDst 
1118 pxor \XMM1, \TMP2 1119 pxor \TMP6, \TMP2 1120 pxor \XMMDst, \TMP2 1121 # middle section of the temp results combined as in karatsuba algorithm 1122 movdqa \TMP2, \TMP4 1123 pslldq $8, \TMP4 # left shift TMP4 2 DWs 1124 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1125 pxor \TMP4, \XMMDst 1126 pxor \TMP2, \TMP6 1127# TMP6:XMMDst holds the result of the accumulated carry-less multiplications 1128 # first phase of the reduction 1129 movdqa \XMMDst, \TMP2 1130 movdqa \XMMDst, \TMP3 1131 movdqa \XMMDst, \TMP4 1132# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently 1133 pslld $31, \TMP2 # packed right shifting << 31 1134 pslld $30, \TMP3 # packed right shifting << 30 1135 pslld $25, \TMP4 # packed right shifting << 25 1136 pxor \TMP3, \TMP2 # xor the shifted versions 1137 pxor \TMP4, \TMP2 1138 movdqa \TMP2, \TMP7 1139 psrldq $4, \TMP7 # right shift TMP7 1 DW 1140 pslldq $12, \TMP2 # left shift TMP2 3 DWs 1141 pxor \TMP2, \XMMDst 1142 1143 # second phase of the reduction 1144 movdqa \XMMDst, \TMP2 1145 # make 3 copies of XMMDst for doing 3 shift operations 1146 movdqa \XMMDst, \TMP3 1147 movdqa \XMMDst, \TMP4 1148 psrld $1, \TMP2 # packed left shift >> 1 1149 psrld $2, \TMP3 # packed left shift >> 2 1150 psrld $7, \TMP4 # packed left shift >> 7 1151 pxor \TMP3, \TMP2 # xor the shifted versions 1152 pxor \TMP4, \TMP2 1153 pxor \TMP7, \TMP2 1154 pxor \TMP2, \XMMDst 1155 pxor \TMP6, \XMMDst # reduced result is in XMMDst 1156.endm 1157 1158/* Encryption of a single block done*/ 1159.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 1160 1161 pxor (%arg1), \XMM0 1162 movaps 16(%arg1), \TMP1 1163 AESENC \TMP1, \XMM0 1164 movaps 32(%arg1), \TMP1 1165 AESENC \TMP1, \XMM0 1166 movaps 48(%arg1), \TMP1 1167 AESENC \TMP1, \XMM0 1168 movaps 64(%arg1), \TMP1 1169 AESENC \TMP1, \XMM0 1170 movaps 80(%arg1), \TMP1 1171 AESENC \TMP1, \XMM0 1172 movaps 96(%arg1), \TMP1 1173 AESENC \TMP1, \XMM0 1174 movaps 112(%arg1), \TMP1 1175 AESENC \TMP1, \XMM0 1176 movaps 128(%arg1), \TMP1 1177 AESENC \TMP1, \XMM0 1178 movaps 144(%arg1), \TMP1 1179 AESENC \TMP1, \XMM0 1180 movaps 160(%arg1), \TMP1 1181 AESENCLAST \TMP1, \XMM0 1182.endm 1183 1184 1185/***************************************************************************** 1186* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1187* u8 *out, // Plaintext output. Encrypt in-place is allowed. 1188* const u8 *in, // Ciphertext input 1189* u64 plaintext_len, // Length of data in bytes for decryption. 1190* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1191* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1192* // concatenated with 0x00000001. 16-byte aligned pointer. 1193* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1194* const u8 *aad, // Additional Authentication Data (AAD) 1195* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1196* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the 1197* // given authentication tag and only return the plaintext if they match. 1198* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 1199* // (most likely), 12 or 8. 1200* 1201* Assumptions: 1202* 1203* keys: 1204* keys are pre-expanded and aligned to 16 bytes. 
we are using the first 1205* set of 11 keys in the data structure void *aes_ctx 1206* 1207* iv: 1208* 0 1 2 3 1209* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1210* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1211* | Salt (From the SA) | 1212* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1213* | Initialization Vector | 1214* | (This is the sequence number from IPSec header) | 1215* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1216* | 0x1 | 1217* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1218* 1219* 1220* 1221* AAD: 1222* AAD padded to 128 bits with 0 1223* for example, assume AAD is a u32 vector 1224* 1225* if AAD is 8 bytes: 1226* AAD[3] = {A0, A1}; 1227* padded AAD in xmm register = {A1 A0 0 0} 1228* 1229* 0 1 2 3 1230* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1231* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1232* | SPI (A1) | 1233* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1234* | 32-bit Sequence Number (A0) | 1235* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1236* | 0x0 | 1237* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1238* 1239* AAD Format with 32-bit Sequence Number 1240* 1241* if AAD is 12 bytes: 1242* AAD[3] = {A0, A1, A2}; 1243* padded AAD in xmm register = {A2 A1 A0 0} 1244* 1245* 0 1 2 3 1246* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1247* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1248* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1249* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1250* | SPI (A2) | 1251* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1252* | 64-bit Extended Sequence Number {A1,A0} | 1253* | | 1254* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1255* | 0x0 | 1256* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1257* 1258* AAD Format with 64-bit Extended Sequence Number 1259* 1260* aadLen: 1261* from the definition of the spec, aadLen can only be 8 or 12 bytes. 1262* The code supports 16 too but for other sizes, the code will fail. 1263* 1264* TLen: 1265* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 1266* For other sizes, the code will fail. 
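*
* Illustration only (not part of this file): given the prototype above, a
* caller is expected to invoke the routine roughly as sketched below. The
* variable names are placeholders; the real in-kernel callers live in the
* C glue code (aesni-intel_glue.c).
*
*	aesni_gcm_dec(aes_ctx, plaintext_out, ciphertext_in, ciphertext_len,
*		      iv,		// salt || IV || 0x00000001, 16-byte aligned
*		      hash_subkey,	// H, 16-byte aligned
*		      aad, 8,		// aadLen of 8 or 12 bytes
*		      auth_tag, 16);	// computed tag; caller compares it
*					// against the received tag
*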
1267* 1268* poly = x^128 + x^127 + x^126 + x^121 + 1 1269* 1270*****************************************************************************/ 1271ENTRY(aesni_gcm_dec) 1272 push %r12 1273 push %r13 1274 push %r14 1275 mov %rsp, %r14 1276/* 1277* states of %xmm registers %xmm6:%xmm15 not saved 1278* all %xmm registers are clobbered 1279*/ 1280 sub $VARIABLE_OFFSET, %rsp 1281 and $~63, %rsp # align rsp to 64 bytes 1282 mov %arg6, %r12 1283 movdqu (%r12), %xmm13 # %xmm13 = HashKey 1284 movdqa SHUF_MASK(%rip), %xmm2 1285 PSHUFB_XMM %xmm2, %xmm13 1286 1287 1288# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) 1289 1290 movdqa %xmm13, %xmm2 1291 psllq $1, %xmm13 1292 psrlq $63, %xmm2 1293 movdqa %xmm2, %xmm1 1294 pslldq $8, %xmm2 1295 psrldq $8, %xmm1 1296 por %xmm2, %xmm13 1297 1298 # Reduction 1299 1300 pshufd $0x24, %xmm1, %xmm2 1301 pcmpeqd TWOONE(%rip), %xmm2 1302 pand POLY(%rip), %xmm2 1303 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly) 1304 1305 1306 # Decrypt first few blocks 1307 1308 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) 1309 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext 1310 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) 1311 mov %r13, %r12 1312 and $(3<<4), %r12 1313 jz _initial_num_blocks_is_0_decrypt 1314 cmp $(2<<4), %r12 1315 jb _initial_num_blocks_is_1_decrypt 1316 je _initial_num_blocks_is_2_decrypt 1317_initial_num_blocks_is_3_decrypt: 1318 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec 1320 sub $48, %r13 1321 jmp _initial_blocks_decrypted 1322_initial_num_blocks_is_2_decrypt: 1323 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec 1325 sub $32, %r13 1326 jmp _initial_blocks_decrypted 1327_initial_num_blocks_is_1_decrypt: 1328 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec 1330 sub $16, %r13 1331 jmp _initial_blocks_decrypted 1332_initial_num_blocks_is_0_decrypt: 1333 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1334%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec 1335_initial_blocks_decrypted: 1336 cmp $0, %r13 1337 je _zero_cipher_left_decrypt 1338 sub $64, %r13 1339 je _four_cipher_left_decrypt 1340_decrypt_by_4: 1341 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1342%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec 1343 add $64, %r11 1344 sub $64, %r13 1345 jne _decrypt_by_4 1346_four_cipher_left_decrypt: 1347 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 1348%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 1349_zero_cipher_left_decrypt: 1350 mov %arg4, %r13 1351 and $15, %r13 # %r13 = arg4 (mod 16) 1352 je _multiple_of_16_bytes_decrypt 1353 1354 # Handle the last <16 byte block separately 1355 1356 paddd ONE(%rip), %xmm0 # increment CNT to get Yn 1357 movdqa SHUF_MASK(%rip), %xmm10 1358 PSHUFB_XMM %xmm10, %xmm0 1359 1360 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) 1361 sub $16, %r11 1362 add %r13, %r11 1363 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block 1364 lea SHIFT_MASK+16(%rip), %r12 1365 sub %r13, %r12 1366# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes 1367# (%r13 is the number of bytes in plaintext mod 16) 1368 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 1369 PSHUFB_XMM %xmm2, %xmm1 # right shift 
16-%r13 butes 1370 1371 movdqa %xmm1, %xmm2 1372 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) 1373 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 1374 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 1375 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 1376 pand %xmm1, %xmm2 1377 movdqa SHUF_MASK(%rip), %xmm10 1378 PSHUFB_XMM %xmm10 ,%xmm2 1379 1380 pxor %xmm2, %xmm8 1381 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1382 # GHASH computation for the last <16 byte block 1383 sub %r13, %r11 1384 add $16, %r11 1385 1386 # output %r13 bytes 1387 MOVQ_R64_XMM %xmm0, %rax 1388 cmp $8, %r13 1389 jle _less_than_8_bytes_left_decrypt 1390 mov %rax, (%arg2 , %r11, 1) 1391 add $8, %r11 1392 psrldq $8, %xmm0 1393 MOVQ_R64_XMM %xmm0, %rax 1394 sub $8, %r13 1395_less_than_8_bytes_left_decrypt: 1396 mov %al, (%arg2, %r11, 1) 1397 add $1, %r11 1398 shr $8, %rax 1399 sub $1, %r13 1400 jne _less_than_8_bytes_left_decrypt 1401_multiple_of_16_bytes_decrypt: 1402 mov arg8, %r12 # %r13 = aadLen (number of bytes) 1403 shl $3, %r12 # convert into number of bits 1404 movd %r12d, %xmm15 # len(A) in %xmm15 1405 shl $3, %arg4 # len(C) in bits (*128) 1406 MOVQ_R64_XMM %arg4, %xmm1 1407 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1408 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1409 pxor %xmm15, %xmm8 1410 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1411 # final GHASH computation 1412 movdqa SHUF_MASK(%rip), %xmm10 1413 PSHUFB_XMM %xmm10, %xmm8 1414 1415 mov %arg5, %rax # %rax = *Y0 1416 movdqu (%rax), %xmm0 # %xmm0 = Y0 1417 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) 1418 pxor %xmm8, %xmm0 1419_return_T_decrypt: 1420 mov arg9, %r10 # %r10 = authTag 1421 mov arg10, %r11 # %r11 = auth_tag_len 1422 cmp $16, %r11 1423 je _T_16_decrypt 1424 cmp $12, %r11 1425 je _T_12_decrypt 1426_T_8_decrypt: 1427 MOVQ_R64_XMM %xmm0, %rax 1428 mov %rax, (%r10) 1429 jmp _return_T_done_decrypt 1430_T_12_decrypt: 1431 MOVQ_R64_XMM %xmm0, %rax 1432 mov %rax, (%r10) 1433 psrldq $8, %xmm0 1434 movd %xmm0, %eax 1435 mov %eax, 8(%r10) 1436 jmp _return_T_done_decrypt 1437_T_16_decrypt: 1438 movdqu %xmm0, (%r10) 1439_return_T_done_decrypt: 1440 mov %r14, %rsp 1441 pop %r14 1442 pop %r13 1443 pop %r12 1444 ret 1445ENDPROC(aesni_gcm_dec) 1446 1447 1448/***************************************************************************** 1449* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1450* u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1451* const u8 *in, // Plaintext input 1452* u64 plaintext_len, // Length of data in bytes for encryption. 1453* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1454* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1455* // concatenated with 0x00000001. 16-byte aligned pointer. 1456* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1457* const u8 *aad, // Additional Authentication Data (AAD) 1458* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1459* u8 *auth_tag, // Authenticated Tag output. 1460* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 1461* // 12 or 8. 1462* 1463* Assumptions: 1464* 1465* keys: 1466* keys are pre-expanded and aligned to 16 bytes. 
we are using the 1467* first set of 11 keys in the data structure void *aes_ctx 1468* 1469* 1470* iv: 1471* 0 1 2 3 1472* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1473* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1474* | Salt (From the SA) | 1475* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1476* | Initialization Vector | 1477* | (This is the sequence number from IPSec header) | 1478* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1479* | 0x1 | 1480* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1481* 1482* 1483* 1484* AAD: 1485* AAD padded to 128 bits with 0 1486* for example, assume AAD is a u32 vector 1487* 1488* if AAD is 8 bytes: 1489* AAD[3] = {A0, A1}; 1490* padded AAD in xmm register = {A1 A0 0 0} 1491* 1492* 0 1 2 3 1493* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1494* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1495* | SPI (A1) | 1496* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1497* | 32-bit Sequence Number (A0) | 1498* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1499* | 0x0 | 1500* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1501* 1502* AAD Format with 32-bit Sequence Number 1503* 1504* if AAD is 12 bytes: 1505* AAD[3] = {A0, A1, A2}; 1506* padded AAD in xmm register = {A2 A1 A0 0} 1507* 1508* 0 1 2 3 1509* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1510* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1511* | SPI (A2) | 1512* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1513* | 64-bit Extended Sequence Number {A1,A0} | 1514* | | 1515* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1516* | 0x0 | 1517* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1518* 1519* AAD Format with 64-bit Extended Sequence Number 1520* 1521* aadLen: 1522* from the definition of the spec, aadLen can only be 8 or 12 bytes. 1523* The code supports 16 too but for other sizes, the code will fail. 1524* 1525* TLen: 1526* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 1527* For other sizes, the code will fail. 
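*
* For reference only (this file does not derive these values): per the GCM
* spec, the hash_subkey passed in is expected to be H = E_K(0^128), i.e.
* one AES encryption of an all-zero block under the same key schedule, and
* the caller builds the pre-counter block described above roughly as:
*
*	u8 iv[16];
*	memcpy(iv, salt, 4);		// 4 byte salt from the SA
*	memcpy(iv + 4, esp_iv, 8);	// 8 byte IV from the ESP payload
*	iv[12] = 0; iv[13] = 0;
*	iv[14] = 0; iv[15] = 1;		// trailing 0x00000001
*
* Names such as salt and esp_iv are illustrative placeholders.
*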
1528* 1529* poly = x^128 + x^127 + x^126 + x^121 + 1 1530***************************************************************************/ 1531ENTRY(aesni_gcm_enc) 1532 push %r12 1533 push %r13 1534 push %r14 1535 mov %rsp, %r14 1536# 1537# states of %xmm registers %xmm6:%xmm15 not saved 1538# all %xmm registers are clobbered 1539# 1540 sub $VARIABLE_OFFSET, %rsp 1541 and $~63, %rsp 1542 mov %arg6, %r12 1543 movdqu (%r12), %xmm13 1544 movdqa SHUF_MASK(%rip), %xmm2 1545 PSHUFB_XMM %xmm2, %xmm13 1546 1547 1548# precompute HashKey<<1 mod poly from the HashKey (required for GHASH) 1549 1550 movdqa %xmm13, %xmm2 1551 psllq $1, %xmm13 1552 psrlq $63, %xmm2 1553 movdqa %xmm2, %xmm1 1554 pslldq $8, %xmm2 1555 psrldq $8, %xmm1 1556 por %xmm2, %xmm13 1557 1558 # reduce HashKey<<1 1559 1560 pshufd $0x24, %xmm1, %xmm2 1561 pcmpeqd TWOONE(%rip), %xmm2 1562 pand POLY(%rip), %xmm2 1563 pxor %xmm2, %xmm13 1564 movdqa %xmm13, HashKey(%rsp) 1565 mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly) 1566 and $-16, %r13 1567 mov %r13, %r12 1568 1569 # Encrypt first few blocks 1570 1571 and $(3<<4), %r12 1572 jz _initial_num_blocks_is_0_encrypt 1573 cmp $(2<<4), %r12 1574 jb _initial_num_blocks_is_1_encrypt 1575 je _initial_num_blocks_is_2_encrypt 1576_initial_num_blocks_is_3_encrypt: 1577 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1578%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc 1579 sub $48, %r13 1580 jmp _initial_blocks_encrypted 1581_initial_num_blocks_is_2_encrypt: 1582 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1583%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc 1584 sub $32, %r13 1585 jmp _initial_blocks_encrypted 1586_initial_num_blocks_is_1_encrypt: 1587 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1588%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc 1589 sub $16, %r13 1590 jmp _initial_blocks_encrypted 1591_initial_num_blocks_is_0_encrypt: 1592 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1593%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc 1594_initial_blocks_encrypted: 1595 1596 # Main loop - Encrypt remaining blocks 1597 1598 cmp $0, %r13 1599 je _zero_cipher_left_encrypt 1600 sub $64, %r13 1601 je _four_cipher_left_encrypt 1602_encrypt_by_4_encrypt: 1603 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1604%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc 1605 add $64, %r11 1606 sub $64, %r13 1607 jne _encrypt_by_4_encrypt 1608_four_cipher_left_encrypt: 1609 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 1610%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 1611_zero_cipher_left_encrypt: 1612 mov %arg4, %r13 1613 and $15, %r13 # %r13 = arg4 (mod 16) 1614 je _multiple_of_16_bytes_encrypt 1615 1616 # Handle the last <16 Byte block separately 1617 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 1618 movdqa SHUF_MASK(%rip), %xmm10 1619 PSHUFB_XMM %xmm10, %xmm0 1620 1621 1622 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) 1623 sub $16, %r11 1624 add %r13, %r11 1625 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks 1626 lea SHIFT_MASK+16(%rip), %r12 1627 sub %r13, %r12 1628 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 1629 # (%r13 is the number of bytes in plaintext mod 16) 1630 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 1631 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte 1632 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) 1633 movdqu 
ALL_F-SHIFT_MASK(%r12), %xmm1 1634 # get the appropriate mask to mask out top 16-r13 bytes of xmm0 1635 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 1636 movdqa SHUF_MASK(%rip), %xmm10 1637 PSHUFB_XMM %xmm10,%xmm0 1638 1639 pxor %xmm0, %xmm8 1640 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1641 # GHASH computation for the last <16 byte block 1642 sub %r13, %r11 1643 add $16, %r11 1644 1645 movdqa SHUF_MASK(%rip), %xmm10 1646 PSHUFB_XMM %xmm10, %xmm0 1647 1648 # shuffle xmm0 back to output as ciphertext 1649 1650 # Output %r13 bytes 1651 MOVQ_R64_XMM %xmm0, %rax 1652 cmp $8, %r13 1653 jle _less_than_8_bytes_left_encrypt 1654 mov %rax, (%arg2 , %r11, 1) 1655 add $8, %r11 1656 psrldq $8, %xmm0 1657 MOVQ_R64_XMM %xmm0, %rax 1658 sub $8, %r13 1659_less_than_8_bytes_left_encrypt: 1660 mov %al, (%arg2, %r11, 1) 1661 add $1, %r11 1662 shr $8, %rax 1663 sub $1, %r13 1664 jne _less_than_8_bytes_left_encrypt 1665_multiple_of_16_bytes_encrypt: 1666 mov arg8, %r12 # %r12 = addLen (number of bytes) 1667 shl $3, %r12 1668 movd %r12d, %xmm15 # len(A) in %xmm15 1669 shl $3, %arg4 # len(C) in bits (*128) 1670 MOVQ_R64_XMM %arg4, %xmm1 1671 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1672 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1673 pxor %xmm15, %xmm8 1674 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1675 # final GHASH computation 1676 movdqa SHUF_MASK(%rip), %xmm10 1677 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap 1678 1679 mov %arg5, %rax # %rax = *Y0 1680 movdqu (%rax), %xmm0 # %xmm0 = Y0 1681 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) 1682 pxor %xmm8, %xmm0 1683_return_T_encrypt: 1684 mov arg9, %r10 # %r10 = authTag 1685 mov arg10, %r11 # %r11 = auth_tag_len 1686 cmp $16, %r11 1687 je _T_16_encrypt 1688 cmp $12, %r11 1689 je _T_12_encrypt 1690_T_8_encrypt: 1691 MOVQ_R64_XMM %xmm0, %rax 1692 mov %rax, (%r10) 1693 jmp _return_T_done_encrypt 1694_T_12_encrypt: 1695 MOVQ_R64_XMM %xmm0, %rax 1696 mov %rax, (%r10) 1697 psrldq $8, %xmm0 1698 movd %xmm0, %eax 1699 mov %eax, 8(%r10) 1700 jmp _return_T_done_encrypt 1701_T_16_encrypt: 1702 movdqu %xmm0, (%r10) 1703_return_T_done_encrypt: 1704 mov %r14, %rsp 1705 pop %r14 1706 pop %r13 1707 pop %r12 1708 ret 1709ENDPROC(aesni_gcm_enc) 1710 1711#endif 1712 1713 1714.align 4 1715_key_expansion_128: 1716_key_expansion_256a: 1717 pshufd $0b11111111, %xmm1, %xmm1 1718 shufps $0b00010000, %xmm0, %xmm4 1719 pxor %xmm4, %xmm0 1720 shufps $0b10001100, %xmm0, %xmm4 1721 pxor %xmm4, %xmm0 1722 pxor %xmm1, %xmm0 1723 movaps %xmm0, (TKEYP) 1724 add $0x10, TKEYP 1725 ret 1726ENDPROC(_key_expansion_128) 1727ENDPROC(_key_expansion_256a) 1728 1729.align 4 1730_key_expansion_192a: 1731 pshufd $0b01010101, %xmm1, %xmm1 1732 shufps $0b00010000, %xmm0, %xmm4 1733 pxor %xmm4, %xmm0 1734 shufps $0b10001100, %xmm0, %xmm4 1735 pxor %xmm4, %xmm0 1736 pxor %xmm1, %xmm0 1737 1738 movaps %xmm2, %xmm5 1739 movaps %xmm2, %xmm6 1740 pslldq $4, %xmm5 1741 pshufd $0b11111111, %xmm0, %xmm3 1742 pxor %xmm3, %xmm2 1743 pxor %xmm5, %xmm2 1744 1745 movaps %xmm0, %xmm1 1746 shufps $0b01000100, %xmm0, %xmm6 1747 movaps %xmm6, (TKEYP) 1748 shufps $0b01001110, %xmm2, %xmm1 1749 movaps %xmm1, 0x10(TKEYP) 1750 add $0x20, TKEYP 1751 ret 1752ENDPROC(_key_expansion_192a) 1753 1754.align 4 1755_key_expansion_192b: 1756 pshufd $0b01010101, %xmm1, %xmm1 1757 shufps $0b00010000, %xmm0, %xmm4 1758 pxor %xmm4, %xmm0 1759 shufps $0b10001100, %xmm0, %xmm4 1760 pxor %xmm4, %xmm0 1761 pxor %xmm1, %xmm0 1762 1763 movaps %xmm2, %xmm5 1764 pslldq $4, 
%xmm5 1765 pshufd $0b11111111, %xmm0, %xmm3 1766 pxor %xmm3, %xmm2 1767 pxor %xmm5, %xmm2 1768 1769 movaps %xmm0, (TKEYP) 1770 add $0x10, TKEYP 1771 ret 1772ENDPROC(_key_expansion_192b) 1773 1774.align 4 1775_key_expansion_256b: 1776 pshufd $0b10101010, %xmm1, %xmm1 1777 shufps $0b00010000, %xmm2, %xmm4 1778 pxor %xmm4, %xmm2 1779 shufps $0b10001100, %xmm2, %xmm4 1780 pxor %xmm4, %xmm2 1781 pxor %xmm1, %xmm2 1782 movaps %xmm2, (TKEYP) 1783 add $0x10, TKEYP 1784 ret 1785ENDPROC(_key_expansion_256b) 1786 1787/* 1788 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 1789 * unsigned int key_len) 1790 */ 1791ENTRY(aesni_set_key) 1792#ifndef __x86_64__ 1793 pushl KEYP 1794 movl 8(%esp), KEYP # ctx 1795 movl 12(%esp), UKEYP # in_key 1796 movl 16(%esp), %edx # key_len 1797#endif 1798 movups (UKEYP), %xmm0 # user key (first 16 bytes) 1799 movaps %xmm0, (KEYP) 1800 lea 0x10(KEYP), TKEYP # key addr 1801 movl %edx, 480(KEYP) 1802 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1803 cmp $24, %dl 1804 jb .Lenc_key128 1805 je .Lenc_key192 1806 movups 0x10(UKEYP), %xmm2 # other user key 1807 movaps %xmm2, (TKEYP) 1808 add $0x10, TKEYP 1809 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1810 call _key_expansion_256a 1811 AESKEYGENASSIST 0x1 %xmm0 %xmm1 1812 call _key_expansion_256b 1813 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1814 call _key_expansion_256a 1815 AESKEYGENASSIST 0x2 %xmm0 %xmm1 1816 call _key_expansion_256b 1817 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 1818 call _key_expansion_256a 1819 AESKEYGENASSIST 0x4 %xmm0 %xmm1 1820 call _key_expansion_256b 1821 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 1822 call _key_expansion_256a 1823 AESKEYGENASSIST 0x8 %xmm0 %xmm1 1824 call _key_expansion_256b 1825 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 1826 call _key_expansion_256a 1827 AESKEYGENASSIST 0x10 %xmm0 %xmm1 1828 call _key_expansion_256b 1829 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 1830 call _key_expansion_256a 1831 AESKEYGENASSIST 0x20 %xmm0 %xmm1 1832 call _key_expansion_256b 1833 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 1834 call _key_expansion_256a 1835 jmp .Ldec_key 1836.Lenc_key192: 1837 movq 0x10(UKEYP), %xmm2 # other user key 1838 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1839 call _key_expansion_192a 1840 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1841 call _key_expansion_192b 1842 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 1843 call _key_expansion_192a 1844 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 1845 call _key_expansion_192b 1846 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 1847 call _key_expansion_192a 1848 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 1849 call _key_expansion_192b 1850 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 1851 call _key_expansion_192a 1852 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8 1853 call _key_expansion_192b 1854 jmp .Ldec_key 1855.Lenc_key128: 1856 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1 1857 call _key_expansion_128 1858 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2 1859 call _key_expansion_128 1860 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3 1861 call _key_expansion_128 1862 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4 1863 call _key_expansion_128 1864 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5 1865 call _key_expansion_128 1866 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6 1867 call _key_expansion_128 1868 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7 1869 call _key_expansion_128 1870 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8 1871 call _key_expansion_128 1872 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9 1873 call _key_expansion_128 1874 
AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 1875 call _key_expansion_128 1876.Ldec_key: 1877 sub $0x10, TKEYP 1878 movaps (KEYP), %xmm0 1879 movaps (TKEYP), %xmm1 1880 movaps %xmm0, 240(TKEYP) 1881 movaps %xmm1, 240(KEYP) 1882 add $0x10, KEYP 1883 lea 240-16(TKEYP), UKEYP 1884.align 4 1885.Ldec_key_loop: 1886 movaps (KEYP), %xmm0 1887 AESIMC %xmm0 %xmm1 1888 movaps %xmm1, (UKEYP) 1889 add $0x10, KEYP 1890 sub $0x10, UKEYP 1891 cmp TKEYP, KEYP 1892 jb .Ldec_key_loop 1893 xor AREG, AREG 1894#ifndef __x86_64__ 1895 popl KEYP 1896#endif 1897 ret 1898ENDPROC(aesni_set_key) 1899 1900/* 1901 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 1902 */ 1903ENTRY(aesni_enc) 1904#ifndef __x86_64__ 1905 pushl KEYP 1906 pushl KLEN 1907 movl 12(%esp), KEYP 1908 movl 16(%esp), OUTP 1909 movl 20(%esp), INP 1910#endif 1911 movl 480(KEYP), KLEN # key length 1912 movups (INP), STATE # input 1913 call _aesni_enc1 1914 movups STATE, (OUTP) # output 1915#ifndef __x86_64__ 1916 popl KLEN 1917 popl KEYP 1918#endif 1919 ret 1920ENDPROC(aesni_enc) 1921 1922/* 1923 * _aesni_enc1: internal ABI 1924 * input: 1925 * KEYP: key struct pointer 1926 * KLEN: round count 1927 * STATE: initial state (input) 1928 * output: 1929 * STATE: finial state (output) 1930 * changed: 1931 * KEY 1932 * TKEYP (T1) 1933 */ 1934.align 4 1935_aesni_enc1: 1936 movaps (KEYP), KEY # key 1937 mov KEYP, TKEYP 1938 pxor KEY, STATE # round 0 1939 add $0x30, TKEYP 1940 cmp $24, KLEN 1941 jb .Lenc128 1942 lea 0x20(TKEYP), TKEYP 1943 je .Lenc192 1944 add $0x20, TKEYP 1945 movaps -0x60(TKEYP), KEY 1946 AESENC KEY STATE 1947 movaps -0x50(TKEYP), KEY 1948 AESENC KEY STATE 1949.align 4 1950.Lenc192: 1951 movaps -0x40(TKEYP), KEY 1952 AESENC KEY STATE 1953 movaps -0x30(TKEYP), KEY 1954 AESENC KEY STATE 1955.align 4 1956.Lenc128: 1957 movaps -0x20(TKEYP), KEY 1958 AESENC KEY STATE 1959 movaps -0x10(TKEYP), KEY 1960 AESENC KEY STATE 1961 movaps (TKEYP), KEY 1962 AESENC KEY STATE 1963 movaps 0x10(TKEYP), KEY 1964 AESENC KEY STATE 1965 movaps 0x20(TKEYP), KEY 1966 AESENC KEY STATE 1967 movaps 0x30(TKEYP), KEY 1968 AESENC KEY STATE 1969 movaps 0x40(TKEYP), KEY 1970 AESENC KEY STATE 1971 movaps 0x50(TKEYP), KEY 1972 AESENC KEY STATE 1973 movaps 0x60(TKEYP), KEY 1974 AESENC KEY STATE 1975 movaps 0x70(TKEYP), KEY 1976 AESENCLAST KEY STATE 1977 ret 1978ENDPROC(_aesni_enc1) 1979 1980/* 1981 * _aesni_enc4: internal ABI 1982 * input: 1983 * KEYP: key struct pointer 1984 * KLEN: round count 1985 * STATE1: initial state (input) 1986 * STATE2 1987 * STATE3 1988 * STATE4 1989 * output: 1990 * STATE1: finial state (output) 1991 * STATE2 1992 * STATE3 1993 * STATE4 1994 * changed: 1995 * KEY 1996 * TKEYP (T1) 1997 */ 1998.align 4 1999_aesni_enc4: 2000 movaps (KEYP), KEY # key 2001 mov KEYP, TKEYP 2002 pxor KEY, STATE1 # round 0 2003 pxor KEY, STATE2 2004 pxor KEY, STATE3 2005 pxor KEY, STATE4 2006 add $0x30, TKEYP 2007 cmp $24, KLEN 2008 jb .L4enc128 2009 lea 0x20(TKEYP), TKEYP 2010 je .L4enc192 2011 add $0x20, TKEYP 2012 movaps -0x60(TKEYP), KEY 2013 AESENC KEY STATE1 2014 AESENC KEY STATE2 2015 AESENC KEY STATE3 2016 AESENC KEY STATE4 2017 movaps -0x50(TKEYP), KEY 2018 AESENC KEY STATE1 2019 AESENC KEY STATE2 2020 AESENC KEY STATE3 2021 AESENC KEY STATE4 2022#.align 4 2023.L4enc192: 2024 movaps -0x40(TKEYP), KEY 2025 AESENC KEY STATE1 2026 AESENC KEY STATE2 2027 AESENC KEY STATE3 2028 AESENC KEY STATE4 2029 movaps -0x30(TKEYP), KEY 2030 AESENC KEY STATE1 2031 AESENC KEY STATE2 2032 AESENC KEY STATE3 2033 AESENC KEY STATE4 2034#.align 4 2035.L4enc128: 2036 
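	# Tail common to all key sizes: TKEYP was biased by the key length
	# above, so the last nine AESENC round keys always sit at
	# -0x20(TKEYP)..0x60(TKEYP) and the AESENCLAST key at 0x70(TKEYP).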
movaps -0x20(TKEYP), KEY 2037 AESENC KEY STATE1 2038 AESENC KEY STATE2 2039 AESENC KEY STATE3 2040 AESENC KEY STATE4 2041 movaps -0x10(TKEYP), KEY 2042 AESENC KEY STATE1 2043 AESENC KEY STATE2 2044 AESENC KEY STATE3 2045 AESENC KEY STATE4 2046 movaps (TKEYP), KEY 2047 AESENC KEY STATE1 2048 AESENC KEY STATE2 2049 AESENC KEY STATE3 2050 AESENC KEY STATE4 2051 movaps 0x10(TKEYP), KEY 2052 AESENC KEY STATE1 2053 AESENC KEY STATE2 2054 AESENC KEY STATE3 2055 AESENC KEY STATE4 2056 movaps 0x20(TKEYP), KEY 2057 AESENC KEY STATE1 2058 AESENC KEY STATE2 2059 AESENC KEY STATE3 2060 AESENC KEY STATE4 2061 movaps 0x30(TKEYP), KEY 2062 AESENC KEY STATE1 2063 AESENC KEY STATE2 2064 AESENC KEY STATE3 2065 AESENC KEY STATE4 2066 movaps 0x40(TKEYP), KEY 2067 AESENC KEY STATE1 2068 AESENC KEY STATE2 2069 AESENC KEY STATE3 2070 AESENC KEY STATE4 2071 movaps 0x50(TKEYP), KEY 2072 AESENC KEY STATE1 2073 AESENC KEY STATE2 2074 AESENC KEY STATE3 2075 AESENC KEY STATE4 2076 movaps 0x60(TKEYP), KEY 2077 AESENC KEY STATE1 2078 AESENC KEY STATE2 2079 AESENC KEY STATE3 2080 AESENC KEY STATE4 2081 movaps 0x70(TKEYP), KEY 2082 AESENCLAST KEY STATE1 # last round 2083 AESENCLAST KEY STATE2 2084 AESENCLAST KEY STATE3 2085 AESENCLAST KEY STATE4 2086 ret 2087ENDPROC(_aesni_enc4) 2088 2089/* 2090 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 2091 */ 2092ENTRY(aesni_dec) 2093#ifndef __x86_64__ 2094 pushl KEYP 2095 pushl KLEN 2096 movl 12(%esp), KEYP 2097 movl 16(%esp), OUTP 2098 movl 20(%esp), INP 2099#endif 2100 mov 480(KEYP), KLEN # key length 2101 add $240, KEYP 2102 movups (INP), STATE # input 2103 call _aesni_dec1 2104 movups STATE, (OUTP) #output 2105#ifndef __x86_64__ 2106 popl KLEN 2107 popl KEYP 2108#endif 2109 ret 2110ENDPROC(aesni_dec) 2111 2112/* 2113 * _aesni_dec1: internal ABI 2114 * input: 2115 * KEYP: key struct pointer 2116 * KLEN: key length 2117 * STATE: initial state (input) 2118 * output: 2119 * STATE: finial state (output) 2120 * changed: 2121 * KEY 2122 * TKEYP (T1) 2123 */ 2124.align 4 2125_aesni_dec1: 2126 movaps (KEYP), KEY # key 2127 mov KEYP, TKEYP 2128 pxor KEY, STATE # round 0 2129 add $0x30, TKEYP 2130 cmp $24, KLEN 2131 jb .Ldec128 2132 lea 0x20(TKEYP), TKEYP 2133 je .Ldec192 2134 add $0x20, TKEYP 2135 movaps -0x60(TKEYP), KEY 2136 AESDEC KEY STATE 2137 movaps -0x50(TKEYP), KEY 2138 AESDEC KEY STATE 2139.align 4 2140.Ldec192: 2141 movaps -0x40(TKEYP), KEY 2142 AESDEC KEY STATE 2143 movaps -0x30(TKEYP), KEY 2144 AESDEC KEY STATE 2145.align 4 2146.Ldec128: 2147 movaps -0x20(TKEYP), KEY 2148 AESDEC KEY STATE 2149 movaps -0x10(TKEYP), KEY 2150 AESDEC KEY STATE 2151 movaps (TKEYP), KEY 2152 AESDEC KEY STATE 2153 movaps 0x10(TKEYP), KEY 2154 AESDEC KEY STATE 2155 movaps 0x20(TKEYP), KEY 2156 AESDEC KEY STATE 2157 movaps 0x30(TKEYP), KEY 2158 AESDEC KEY STATE 2159 movaps 0x40(TKEYP), KEY 2160 AESDEC KEY STATE 2161 movaps 0x50(TKEYP), KEY 2162 AESDEC KEY STATE 2163 movaps 0x60(TKEYP), KEY 2164 AESDEC KEY STATE 2165 movaps 0x70(TKEYP), KEY 2166 AESDECLAST KEY STATE 2167 ret 2168ENDPROC(_aesni_dec1) 2169 2170/* 2171 * _aesni_dec4: internal ABI 2172 * input: 2173 * KEYP: key struct pointer 2174 * KLEN: key length 2175 * STATE1: initial state (input) 2176 * STATE2 2177 * STATE3 2178 * STATE4 2179 * output: 2180 * STATE1: finial state (output) 2181 * STATE2 2182 * STATE3 2183 * STATE4 2184 * changed: 2185 * KEY 2186 * TKEYP (T1) 2187 */ 2188.align 4 2189_aesni_dec4: 2190 movaps (KEYP), KEY # key 2191 mov KEYP, TKEYP 2192 pxor KEY, STATE1 # round 0 2193 pxor KEY, STATE2 2194 pxor 
KEY, STATE3 2195 pxor KEY, STATE4 2196 add $0x30, TKEYP 2197 cmp $24, KLEN 2198 jb .L4dec128 2199 lea 0x20(TKEYP), TKEYP 2200 je .L4dec192 2201 add $0x20, TKEYP 2202 movaps -0x60(TKEYP), KEY 2203 AESDEC KEY STATE1 2204 AESDEC KEY STATE2 2205 AESDEC KEY STATE3 2206 AESDEC KEY STATE4 2207 movaps -0x50(TKEYP), KEY 2208 AESDEC KEY STATE1 2209 AESDEC KEY STATE2 2210 AESDEC KEY STATE3 2211 AESDEC KEY STATE4 2212.align 4 2213.L4dec192: 2214 movaps -0x40(TKEYP), KEY 2215 AESDEC KEY STATE1 2216 AESDEC KEY STATE2 2217 AESDEC KEY STATE3 2218 AESDEC KEY STATE4 2219 movaps -0x30(TKEYP), KEY 2220 AESDEC KEY STATE1 2221 AESDEC KEY STATE2 2222 AESDEC KEY STATE3 2223 AESDEC KEY STATE4 2224.align 4 2225.L4dec128: 2226 movaps -0x20(TKEYP), KEY 2227 AESDEC KEY STATE1 2228 AESDEC KEY STATE2 2229 AESDEC KEY STATE3 2230 AESDEC KEY STATE4 2231 movaps -0x10(TKEYP), KEY 2232 AESDEC KEY STATE1 2233 AESDEC KEY STATE2 2234 AESDEC KEY STATE3 2235 AESDEC KEY STATE4 2236 movaps (TKEYP), KEY 2237 AESDEC KEY STATE1 2238 AESDEC KEY STATE2 2239 AESDEC KEY STATE3 2240 AESDEC KEY STATE4 2241 movaps 0x10(TKEYP), KEY 2242 AESDEC KEY STATE1 2243 AESDEC KEY STATE2 2244 AESDEC KEY STATE3 2245 AESDEC KEY STATE4 2246 movaps 0x20(TKEYP), KEY 2247 AESDEC KEY STATE1 2248 AESDEC KEY STATE2 2249 AESDEC KEY STATE3 2250 AESDEC KEY STATE4 2251 movaps 0x30(TKEYP), KEY 2252 AESDEC KEY STATE1 2253 AESDEC KEY STATE2 2254 AESDEC KEY STATE3 2255 AESDEC KEY STATE4 2256 movaps 0x40(TKEYP), KEY 2257 AESDEC KEY STATE1 2258 AESDEC KEY STATE2 2259 AESDEC KEY STATE3 2260 AESDEC KEY STATE4 2261 movaps 0x50(TKEYP), KEY 2262 AESDEC KEY STATE1 2263 AESDEC KEY STATE2 2264 AESDEC KEY STATE3 2265 AESDEC KEY STATE4 2266 movaps 0x60(TKEYP), KEY 2267 AESDEC KEY STATE1 2268 AESDEC KEY STATE2 2269 AESDEC KEY STATE3 2270 AESDEC KEY STATE4 2271 movaps 0x70(TKEYP), KEY 2272 AESDECLAST KEY STATE1 # last round 2273 AESDECLAST KEY STATE2 2274 AESDECLAST KEY STATE3 2275 AESDECLAST KEY STATE4 2276 ret 2277ENDPROC(_aesni_dec4) 2278 2279/* 2280 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2281 * size_t len) 2282 */ 2283ENTRY(aesni_ecb_enc) 2284#ifndef __x86_64__ 2285 pushl LEN 2286 pushl KEYP 2287 pushl KLEN 2288 movl 16(%esp), KEYP 2289 movl 20(%esp), OUTP 2290 movl 24(%esp), INP 2291 movl 28(%esp), LEN 2292#endif 2293 test LEN, LEN # check length 2294 jz .Lecb_enc_ret 2295 mov 480(KEYP), KLEN 2296 cmp $16, LEN 2297 jb .Lecb_enc_ret 2298 cmp $64, LEN 2299 jb .Lecb_enc_loop1 2300.align 4 2301.Lecb_enc_loop4: 2302 movups (INP), STATE1 2303 movups 0x10(INP), STATE2 2304 movups 0x20(INP), STATE3 2305 movups 0x30(INP), STATE4 2306 call _aesni_enc4 2307 movups STATE1, (OUTP) 2308 movups STATE2, 0x10(OUTP) 2309 movups STATE3, 0x20(OUTP) 2310 movups STATE4, 0x30(OUTP) 2311 sub $64, LEN 2312 add $64, INP 2313 add $64, OUTP 2314 cmp $64, LEN 2315 jge .Lecb_enc_loop4 2316 cmp $16, LEN 2317 jb .Lecb_enc_ret 2318.align 4 2319.Lecb_enc_loop1: 2320 movups (INP), STATE1 2321 call _aesni_enc1 2322 movups STATE1, (OUTP) 2323 sub $16, LEN 2324 add $16, INP 2325 add $16, OUTP 2326 cmp $16, LEN 2327 jge .Lecb_enc_loop1 2328.Lecb_enc_ret: 2329#ifndef __x86_64__ 2330 popl KLEN 2331 popl KEYP 2332 popl LEN 2333#endif 2334 ret 2335ENDPROC(aesni_ecb_enc) 2336 2337/* 2338 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2339 * size_t len); 2340 */ 2341ENTRY(aesni_ecb_dec) 2342#ifndef __x86_64__ 2343 pushl LEN 2344 pushl KEYP 2345 pushl KLEN 2346 movl 16(%esp), KEYP 2347 movl 20(%esp), OUTP 2348 movl 24(%esp), INP 2349 movl 28(%esp), LEN 2350#endif 
2351 test LEN, LEN 2352 jz .Lecb_dec_ret 2353 mov 480(KEYP), KLEN 2354 add $240, KEYP 2355 cmp $16, LEN 2356 jb .Lecb_dec_ret 2357 cmp $64, LEN 2358 jb .Lecb_dec_loop1 2359.align 4 2360.Lecb_dec_loop4: 2361 movups (INP), STATE1 2362 movups 0x10(INP), STATE2 2363 movups 0x20(INP), STATE3 2364 movups 0x30(INP), STATE4 2365 call _aesni_dec4 2366 movups STATE1, (OUTP) 2367 movups STATE2, 0x10(OUTP) 2368 movups STATE3, 0x20(OUTP) 2369 movups STATE4, 0x30(OUTP) 2370 sub $64, LEN 2371 add $64, INP 2372 add $64, OUTP 2373 cmp $64, LEN 2374 jge .Lecb_dec_loop4 2375 cmp $16, LEN 2376 jb .Lecb_dec_ret 2377.align 4 2378.Lecb_dec_loop1: 2379 movups (INP), STATE1 2380 call _aesni_dec1 2381 movups STATE1, (OUTP) 2382 sub $16, LEN 2383 add $16, INP 2384 add $16, OUTP 2385 cmp $16, LEN 2386 jge .Lecb_dec_loop1 2387.Lecb_dec_ret: 2388#ifndef __x86_64__ 2389 popl KLEN 2390 popl KEYP 2391 popl LEN 2392#endif 2393 ret 2394ENDPROC(aesni_ecb_dec) 2395 2396/* 2397 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2398 * size_t len, u8 *iv) 2399 */ 2400ENTRY(aesni_cbc_enc) 2401#ifndef __x86_64__ 2402 pushl IVP 2403 pushl LEN 2404 pushl KEYP 2405 pushl KLEN 2406 movl 20(%esp), KEYP 2407 movl 24(%esp), OUTP 2408 movl 28(%esp), INP 2409 movl 32(%esp), LEN 2410 movl 36(%esp), IVP 2411#endif 2412 cmp $16, LEN 2413 jb .Lcbc_enc_ret 2414 mov 480(KEYP), KLEN 2415 movups (IVP), STATE # load iv as initial state 2416.align 4 2417.Lcbc_enc_loop: 2418 movups (INP), IN # load input 2419 pxor IN, STATE 2420 call _aesni_enc1 2421 movups STATE, (OUTP) # store output 2422 sub $16, LEN 2423 add $16, INP 2424 add $16, OUTP 2425 cmp $16, LEN 2426 jge .Lcbc_enc_loop 2427 movups STATE, (IVP) 2428.Lcbc_enc_ret: 2429#ifndef __x86_64__ 2430 popl KLEN 2431 popl KEYP 2432 popl LEN 2433 popl IVP 2434#endif 2435 ret 2436ENDPROC(aesni_cbc_enc) 2437 2438/* 2439 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2440 * size_t len, u8 *iv) 2441 */ 2442ENTRY(aesni_cbc_dec) 2443#ifndef __x86_64__ 2444 pushl IVP 2445 pushl LEN 2446 pushl KEYP 2447 pushl KLEN 2448 movl 20(%esp), KEYP 2449 movl 24(%esp), OUTP 2450 movl 28(%esp), INP 2451 movl 32(%esp), LEN 2452 movl 36(%esp), IVP 2453#endif 2454 cmp $16, LEN 2455 jb .Lcbc_dec_just_ret 2456 mov 480(KEYP), KLEN 2457 add $240, KEYP 2458 movups (IVP), IV 2459 cmp $64, LEN 2460 jb .Lcbc_dec_loop1 2461.align 4 2462.Lcbc_dec_loop4: 2463 movups (INP), IN1 2464 movaps IN1, STATE1 2465 movups 0x10(INP), IN2 2466 movaps IN2, STATE2 2467#ifdef __x86_64__ 2468 movups 0x20(INP), IN3 2469 movaps IN3, STATE3 2470 movups 0x30(INP), IN4 2471 movaps IN4, STATE4 2472#else 2473 movups 0x20(INP), IN1 2474 movaps IN1, STATE3 2475 movups 0x30(INP), IN2 2476 movaps IN2, STATE4 2477#endif 2478 call _aesni_dec4 2479 pxor IV, STATE1 2480#ifdef __x86_64__ 2481 pxor IN1, STATE2 2482 pxor IN2, STATE3 2483 pxor IN3, STATE4 2484 movaps IN4, IV 2485#else 2486 pxor IN1, STATE4 2487 movaps IN2, IV 2488 movups (INP), IN1 2489 pxor IN1, STATE2 2490 movups 0x10(INP), IN2 2491 pxor IN2, STATE3 2492#endif 2493 movups STATE1, (OUTP) 2494 movups STATE2, 0x10(OUTP) 2495 movups STATE3, 0x20(OUTP) 2496 movups STATE4, 0x30(OUTP) 2497 sub $64, LEN 2498 add $64, INP 2499 add $64, OUTP 2500 cmp $64, LEN 2501 jge .Lcbc_dec_loop4 2502 cmp $16, LEN 2503 jb .Lcbc_dec_ret 2504.align 4 2505.Lcbc_dec_loop1: 2506 movups (INP), IN 2507 movaps IN, STATE 2508 call _aesni_dec1 2509 pxor IV, STATE 2510 movups STATE, (OUTP) 2511 movaps IN, IV 2512 sub $16, LEN 2513 add $16, INP 2514 add $16, OUTP 2515 cmp $16, LEN 2516 
jge .Lcbc_dec_loop1 2517.Lcbc_dec_ret: 2518 movups IV, (IVP) 2519.Lcbc_dec_just_ret: 2520#ifndef __x86_64__ 2521 popl KLEN 2522 popl KEYP 2523 popl LEN 2524 popl IVP 2525#endif 2526 ret 2527ENDPROC(aesni_cbc_dec) 2528 2529#ifdef __x86_64__ 2530.align 16 2531.Lbswap_mask: 2532 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2533 2534/* 2535 * _aesni_inc_init: internal ABI 2536 * setup registers used by _aesni_inc 2537 * input: 2538 * IV 2539 * output: 2540 * CTR: == IV, in little endian 2541 * TCTR_LOW: == lower qword of CTR 2542 * INC: == 1, in little endian 2543 * BSWAP_MASK == endian swapping mask 2544 */ 2545.align 4 2546_aesni_inc_init: 2547 movaps .Lbswap_mask, BSWAP_MASK 2548 movaps IV, CTR 2549 PSHUFB_XMM BSWAP_MASK CTR 2550 mov $1, TCTR_LOW 2551 MOVQ_R64_XMM TCTR_LOW INC 2552 MOVQ_R64_XMM CTR TCTR_LOW 2553 ret 2554ENDPROC(_aesni_inc_init) 2555 2556/* 2557 * _aesni_inc: internal ABI 2558 * Increase IV by 1, IV is in big endian 2559 * input: 2560 * IV 2561 * CTR: == IV, in little endian 2562 * TCTR_LOW: == lower qword of CTR 2563 * INC: == 1, in little endian 2564 * BSWAP_MASK == endian swapping mask 2565 * output: 2566 * IV: Increase by 1 2567 * changed: 2568 * CTR: == output IV, in little endian 2569 * TCTR_LOW: == lower qword of CTR 2570 */ 2571.align 4 2572_aesni_inc: 2573 paddq INC, CTR 2574 add $1, TCTR_LOW 2575 jnc .Linc_low 2576 pslldq $8, INC 2577 paddq INC, CTR 2578 psrldq $8, INC 2579.Linc_low: 2580 movaps CTR, IV 2581 PSHUFB_XMM BSWAP_MASK IV 2582 ret 2583ENDPROC(_aesni_inc) 2584 2585/* 2586 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2587 * size_t len, u8 *iv) 2588 */ 2589ENTRY(aesni_ctr_enc) 2590 cmp $16, LEN 2591 jb .Lctr_enc_just_ret 2592 mov 480(KEYP), KLEN 2593 movups (IVP), IV 2594 call _aesni_inc_init 2595 cmp $64, LEN 2596 jb .Lctr_enc_loop1 2597.align 4 2598.Lctr_enc_loop4: 2599 movaps IV, STATE1 2600 call _aesni_inc 2601 movups (INP), IN1 2602 movaps IV, STATE2 2603 call _aesni_inc 2604 movups 0x10(INP), IN2 2605 movaps IV, STATE3 2606 call _aesni_inc 2607 movups 0x20(INP), IN3 2608 movaps IV, STATE4 2609 call _aesni_inc 2610 movups 0x30(INP), IN4 2611 call _aesni_enc4 2612 pxor IN1, STATE1 2613 movups STATE1, (OUTP) 2614 pxor IN2, STATE2 2615 movups STATE2, 0x10(OUTP) 2616 pxor IN3, STATE3 2617 movups STATE3, 0x20(OUTP) 2618 pxor IN4, STATE4 2619 movups STATE4, 0x30(OUTP) 2620 sub $64, LEN 2621 add $64, INP 2622 add $64, OUTP 2623 cmp $64, LEN 2624 jge .Lctr_enc_loop4 2625 cmp $16, LEN 2626 jb .Lctr_enc_ret 2627.align 4 2628.Lctr_enc_loop1: 2629 movaps IV, STATE 2630 call _aesni_inc 2631 movups (INP), IN 2632 call _aesni_enc1 2633 pxor IN, STATE 2634 movups STATE, (OUTP) 2635 sub $16, LEN 2636 add $16, INP 2637 add $16, OUTP 2638 cmp $16, LEN 2639 jge .Lctr_enc_loop1 2640.Lctr_enc_ret: 2641 movups IV, (IVP) 2642.Lctr_enc_just_ret: 2643 ret 2644ENDPROC(aesni_ctr_enc) 2645 2646/* 2647 * _aesni_gf128mul_x_ble: internal ABI 2648 * Multiply in GF(2^128) for XTS IVs 2649 * input: 2650 * IV: current IV 2651 * GF128MUL_MASK == mask with 0x87 and 0x01 2652 * output: 2653 * IV: next IV 2654 * changed: 2655 * CTR: == temporary value 2656 */ 2657#define _aesni_gf128mul_x_ble() \ 2658 pshufd $0x13, IV, CTR; \ 2659 paddq IV, IV; \ 2660 psrad $31, CTR; \ 2661 pand GF128MUL_MASK, CTR; \ 2662 pxor CTR, IV; 2663 2664/* 2665 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2666 * bool enc, u8 *iv) 2667 */ 2668ENTRY(aesni_xts_crypt8) 2669 cmpb $0, %cl 2670 movl $0, %ecx 2671 movl $240, %r10d 2672 leaq _aesni_enc4, 
%r11 2673 leaq _aesni_dec4, %rax 2674 cmovel %r10d, %ecx 2675 cmoveq %rax, %r11 2676 2677 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK 2678 movups (IVP), IV 2679 2680 mov 480(KEYP), KLEN 2681 addq %rcx, KEYP 2682 2683 movdqa IV, STATE1 2684 movdqu 0x00(INP), INC 2685 pxor INC, STATE1 2686 movdqu IV, 0x00(OUTP) 2687 2688 _aesni_gf128mul_x_ble() 2689 movdqa IV, STATE2 2690 movdqu 0x10(INP), INC 2691 pxor INC, STATE2 2692 movdqu IV, 0x10(OUTP) 2693 2694 _aesni_gf128mul_x_ble() 2695 movdqa IV, STATE3 2696 movdqu 0x20(INP), INC 2697 pxor INC, STATE3 2698 movdqu IV, 0x20(OUTP) 2699 2700 _aesni_gf128mul_x_ble() 2701 movdqa IV, STATE4 2702 movdqu 0x30(INP), INC 2703 pxor INC, STATE4 2704 movdqu IV, 0x30(OUTP) 2705 2706 call *%r11 2707 2708 movdqu 0x00(OUTP), INC 2709 pxor INC, STATE1 2710 movdqu STATE1, 0x00(OUTP) 2711 2712 _aesni_gf128mul_x_ble() 2713 movdqa IV, STATE1 2714 movdqu 0x40(INP), INC 2715 pxor INC, STATE1 2716 movdqu IV, 0x40(OUTP) 2717 2718 movdqu 0x10(OUTP), INC 2719 pxor INC, STATE2 2720 movdqu STATE2, 0x10(OUTP) 2721 2722 _aesni_gf128mul_x_ble() 2723 movdqa IV, STATE2 2724 movdqu 0x50(INP), INC 2725 pxor INC, STATE2 2726 movdqu IV, 0x50(OUTP) 2727 2728 movdqu 0x20(OUTP), INC 2729 pxor INC, STATE3 2730 movdqu STATE3, 0x20(OUTP) 2731 2732 _aesni_gf128mul_x_ble() 2733 movdqa IV, STATE3 2734 movdqu 0x60(INP), INC 2735 pxor INC, STATE3 2736 movdqu IV, 0x60(OUTP) 2737 2738 movdqu 0x30(OUTP), INC 2739 pxor INC, STATE4 2740 movdqu STATE4, 0x30(OUTP) 2741 2742 _aesni_gf128mul_x_ble() 2743 movdqa IV, STATE4 2744 movdqu 0x70(INP), INC 2745 pxor INC, STATE4 2746 movdqu IV, 0x70(OUTP) 2747 2748 _aesni_gf128mul_x_ble() 2749 movups IV, (IVP) 2750 2751 call *%r11 2752 2753 movdqu 0x40(OUTP), INC 2754 pxor INC, STATE1 2755 movdqu STATE1, 0x40(OUTP) 2756 2757 movdqu 0x50(OUTP), INC 2758 pxor INC, STATE2 2759 movdqu STATE2, 0x50(OUTP) 2760 2761 movdqu 0x60(OUTP), INC 2762 pxor INC, STATE3 2763 movdqu STATE3, 0x60(OUTP) 2764 2765 movdqu 0x70(OUTP), INC 2766 pxor INC, STATE4 2767 movdqu STATE4, 0x70(OUTP) 2768 2769 ret 2770ENDPROC(aesni_xts_crypt8) 2771 2772#endif 2773
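/*
 * Note on _aesni_gf128mul_x_ble(): the pshufd/psrad/pand/pxor sequence is a
 * branch-free doubling of the 128-bit XTS tweak in GF(2^128).  paddq doubles
 * each 64-bit half independently, and the masked value in CTR re-injects the
 * carry between the halves (the 0x01 lane of GF128MUL_MASK) and folds the
 * bit shifted out of the top back into the low byte (the 0x87 lane), i.e.
 * the reduction by x^128 + x^7 + x^2 + x + 1.  A rough C sketch of the same
 * operation, illustrative only and not part of this file (the helper name
 * and the u64-pair layout are assumptions):
 *
 *	void gf128mul_x_ble(u64 t[2])	// t[0] = low qword, t[1] = high qword
 *	{
 *		u64 carry = t[1] >> 63;			// MSB of the tweak
 *		t[1] = (t[1] << 1) | (t[0] >> 63);	// shift whole tweak left by one
 *		t[0] = (t[0] << 1) ^ (carry * 0x87);	// reduce if the MSB was set
 *	}
 */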