/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values: for FP,
 * use movaps (move aligned packed single); for integer, use movdqa (move
 * double quadword aligned).  It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released.  However,
 * movaps is a byte shorter, so that is the one we'll use for now (same for the
 * unaligned variants).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

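/*
 * Illustrative only, not part of this file: the MOVADQ/MOVUDQ distinction
 * above maps onto the aligned vs. unaligned 16-byte loads shown in the
 * user-space C sketch below (SSE2 intrinsics; buf[] is a made-up example).
 *
 *	#include <emmintrin.h>
 *	#include <stdint.h>
 *
 *	static uint8_t buf[32] __attribute__((aligned(16)));
 *
 *	void load_example(void)
 *	{
 *		// like MOVADQ: source must be 16-byte aligned
 *		__m128i a = _mm_load_si128((const __m128i *)buf);
 *		// like MOVUDQ: any alignment is fine
 *		__m128i u = _mm_loadu_si128((const __m128i *)(buf + 1));
 *		_mm_storeu_si128((__m128i *)(buf + 16), _mm_xor_si128(a, u));
 *	}
 */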
#ifdef __x86_64__

.data
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
POLY:   .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F

SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
MASK1:      .octa 0x0000000000000000ffffffffffffffff
MASK2:      .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
ZERO:       .octa 0x00000000000000000000000000000000
ONE:        .octa 0x00000000000000000000000000000001
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
dec:        .octa 0x1
enc:        .octa 0x2


.text


#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif


#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
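/*
 * Illustrative only, not part of this file: a user-space C sketch of the
 * Karatsuba trick used by GHASH_MUL below. Three carry-less multiplies
 * stand in for the four cross products of a 128x128 multiply; the sketch
 * checks the middle-term identity
 * (a1*b0)^(a0*b1) == ((a1^a0)*(b1^b0))^(a1*b1)^(a0*b0).
 * The bit-reflected reduction mod the GHASH polynomial that follows in the
 * macro is omitted here. Names and values are made up for the example;
 * build with something like: gcc -O2 -msse2 -mpclmul karatsuba_demo.c
 *
 *	#include <stdio.h>
 *	#include <emmintrin.h>			// SSE2 helpers
 *	#include <wmmintrin.h>			// _mm_clmulepi64_si128 (PCLMULQDQ)
 *
 *	int main(void)
 *	{
 *		__m128i a = _mm_set_epi64x(0x0123456789abcdefULL, 0xfedcba9876543210ULL);
 *		__m128i b = _mm_set_epi64x(0x0f1e2d3c4b5a6978ULL, 0x8796a5b4c3d2e1f0ULL);
 *
 *		__m128i hi = _mm_clmulepi64_si128(a, b, 0x11);	// a1*b1
 *		__m128i lo = _mm_clmulepi64_si128(a, b, 0x00);	// a0*b0
 *
 *		// middle term computed directly with two more multiplies ...
 *		__m128i mid_direct = _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x01),
 *						   _mm_clmulepi64_si128(a, b, 0x10));
 *
 *		// ... and via Karatsuba, as in the macro (pshufd $78 swaps halves)
 *		__m128i a10 = _mm_xor_si128(a, _mm_shuffle_epi32(a, 78));
 *		__m128i b10 = _mm_xor_si128(b, _mm_shuffle_epi32(b, 78));
 *		__m128i mid_kara = _mm_clmulepi64_si128(a10, b10, 0x00);
 *		mid_kara = _mm_xor_si128(mid_kara, _mm_xor_si128(hi, lo));
 *
 *		__m128i eq = _mm_cmpeq_epi8(mid_direct, mid_kara);
 *		printf("karatsuba middle term %s\n",
 *		       _mm_movemask_epi8(eq) == 0xffff ? "matches" : "differs");
 *		return 0;
 *	}
 */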
.macro	GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = (a1*b0)+(a0*b1), the middle term
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift << 31
	pslld	  $30, \TMP3		# packed left shift << 30
	pslld	  $25, \TMP4		# packed left shift << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH,\TMP3
	movdqa	  \GH,\TMP4
	psrld	  $1,\TMP2		# packed right shift >> 1
	psrld	  $2,\TMP3		# packed right shift >> 2
	psrld	  $7,\TMP4		# packed right shift >> 7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm

/*
* if a = number of total plaintext bytes
*	b = floor(a/16)
*	num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
*/


.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i

_get_AAD_loop\num_initial_blocks\operation:
	movd	   (%r10), \TMP1
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation

	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation

	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	psrldq	   $4, %xmm\i
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation

_get_AAD_loop2_done\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data

	xor	   %r11, %r11		# initialise the data pointer offset as zero

	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ	   ONE(%RIP),\TMP1
	MOVADQ	   (%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0		# INCR Y0
	movdqa	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13

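	# Note on the round count computed above: keysize refers to the
	# key_length field of struct crypto_aes_ctx (offset 2*15*16 = 480),
	# i.e. 16, 24 or 32 bytes. keysize/4 + 5 yields 9, 11 or 13, the
	# number of AESENC rounds performed before the final AESENCLAST
	# (AES-128/192/256 use 10/12/14 rounds in total).
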
279aes_loop_initial_dec\num_initial_blocks: 280 MOVADQ (%r10),\TMP1 281.irpc index, \i_seq 282 AESENC \TMP1, %xmm\index 283.endr 284 add $16,%r10 285 sub $1,%eax 286 jnz aes_loop_initial_dec\num_initial_blocks 287 288 MOVADQ (%r10), \TMP1 289.irpc index, \i_seq 290 AESENCLAST \TMP1, %xmm\index # Last Round 291.endr 292.irpc index, \i_seq 293 movdqu (%arg3 , %r11, 1), \TMP1 294 pxor \TMP1, %xmm\index 295 movdqu %xmm\index, (%arg2 , %r11, 1) 296 # write back plaintext/ciphertext for num_initial_blocks 297 add $16, %r11 298 299 movdqa \TMP1, %xmm\index 300 PSHUFB_XMM %xmm14, %xmm\index 301 # prepare plaintext/ciphertext for GHASH computation 302.endr 303.endif 304 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 305 # apply GHASH on num_initial_blocks blocks 306 307.if \i == 5 308 pxor %xmm5, %xmm6 309 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 310 pxor %xmm6, %xmm7 311 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 312 pxor %xmm7, %xmm8 313 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 314.elseif \i == 6 315 pxor %xmm6, %xmm7 316 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 317 pxor %xmm7, %xmm8 318 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 319.elseif \i == 7 320 pxor %xmm7, %xmm8 321 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 322.endif 323 cmp $64, %r13 324 jl _initial_blocks_done\num_initial_blocks\operation 325 # no need for precomputed values 326/* 327* 328* Precomputations for HashKey parallel with encryption of first 4 blocks. 329* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 330*/ 331 MOVADQ ONE(%rip), \TMP1 332 paddd \TMP1, \XMM0 # INCR Y0 333 MOVADQ \XMM0, \XMM1 334 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 335 336 paddd \TMP1, \XMM0 # INCR Y0 337 MOVADQ \XMM0, \XMM2 338 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 339 340 paddd \TMP1, \XMM0 # INCR Y0 341 MOVADQ \XMM0, \XMM3 342 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 343 344 paddd \TMP1, \XMM0 # INCR Y0 345 MOVADQ \XMM0, \XMM4 346 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 347 348 MOVADQ 0(%arg1),\TMP1 349 pxor \TMP1, \XMM1 350 pxor \TMP1, \XMM2 351 pxor \TMP1, \XMM3 352 pxor \TMP1, \XMM4 353 movdqa \TMP3, \TMP5 354 pshufd $78, \TMP3, \TMP1 355 pxor \TMP3, \TMP1 356 movdqa \TMP1, HashKey_k(%rsp) 357 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 358# TMP5 = HashKey^2<<1 (mod poly) 359 movdqa \TMP5, HashKey_2(%rsp) 360# HashKey_2 = HashKey^2<<1 (mod poly) 361 pshufd $78, \TMP5, \TMP1 362 pxor \TMP5, \TMP1 363 movdqa \TMP1, HashKey_2_k(%rsp) 364.irpc index, 1234 # do 4 rounds 365 movaps 0x10*\index(%arg1), \TMP1 366 AESENC \TMP1, \XMM1 367 AESENC \TMP1, \XMM2 368 AESENC \TMP1, \XMM3 369 AESENC \TMP1, \XMM4 370.endr 371 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 372# TMP5 = HashKey^3<<1 (mod poly) 373 movdqa \TMP5, HashKey_3(%rsp) 374 pshufd $78, \TMP5, \TMP1 375 pxor \TMP5, \TMP1 376 movdqa \TMP1, HashKey_3_k(%rsp) 377.irpc index, 56789 # do next 5 rounds 378 movaps 0x10*\index(%arg1), \TMP1 379 AESENC \TMP1, \XMM1 380 AESENC \TMP1, \XMM2 381 AESENC \TMP1, \XMM3 382 AESENC \TMP1, \XMM4 383.endr 384 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 385# TMP5 = HashKey^3<<1 (mod poly) 386 movdqa \TMP5, HashKey_4(%rsp) 387 pshufd $78, \TMP5, \TMP1 388 pxor \TMP5, \TMP1 389 movdqa \TMP1, HashKey_4_k(%rsp) 390 lea 0xa0(%arg1),%r10 391 mov keysize,%eax 392 shr $2,%eax # 128->4, 192->6, 256->8 393 sub $4,%eax # 128->0, 192->2, 256->4 394 jz 
aes_loop_pre_dec_done\num_initial_blocks 395 396aes_loop_pre_dec\num_initial_blocks: 397 MOVADQ (%r10),\TMP2 398.irpc index, 1234 399 AESENC \TMP2, %xmm\index 400.endr 401 add $16,%r10 402 sub $1,%eax 403 jnz aes_loop_pre_dec\num_initial_blocks 404 405aes_loop_pre_dec_done\num_initial_blocks: 406 MOVADQ (%r10), \TMP2 407 AESENCLAST \TMP2, \XMM1 408 AESENCLAST \TMP2, \XMM2 409 AESENCLAST \TMP2, \XMM3 410 AESENCLAST \TMP2, \XMM4 411 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 412 pxor \TMP1, \XMM1 413 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 414 movdqa \TMP1, \XMM1 415 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 416 pxor \TMP1, \XMM2 417 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 418 movdqa \TMP1, \XMM2 419 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 420 pxor \TMP1, \XMM3 421 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 422 movdqa \TMP1, \XMM3 423 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 424 pxor \TMP1, \XMM4 425 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 426 movdqa \TMP1, \XMM4 427 add $64, %r11 428 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 429 pxor \XMMDst, \XMM1 430# combine GHASHed value with the corresponding ciphertext 431 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 432 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 433 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 434 435_initial_blocks_done\num_initial_blocks\operation: 436 437.endm 438 439 440/* 441* if a = number of total plaintext bytes 442* b = floor(a/16) 443* num_initial_blocks = b mod 4 444* encrypt the initial num_initial_blocks blocks and apply ghash on 445* the ciphertext 446* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 447* are clobbered 448* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified 449*/ 450 451 452.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 453XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 454 MOVADQ SHUF_MASK(%rip), %xmm14 455 mov arg7, %r10 # %r10 = AAD 456 mov arg8, %r12 # %r12 = aadLen 457 mov %r12, %r11 458 pxor %xmm\i, %xmm\i 459_get_AAD_loop\num_initial_blocks\operation: 460 movd (%r10), \TMP1 461 pslldq $12, \TMP1 462 psrldq $4, %xmm\i 463 pxor \TMP1, %xmm\i 464 add $4, %r10 465 sub $4, %r12 466 jne _get_AAD_loop\num_initial_blocks\operation 467 cmp $16, %r11 468 je _get_AAD_loop2_done\num_initial_blocks\operation 469 mov $16, %r12 470_get_AAD_loop2\num_initial_blocks\operation: 471 psrldq $4, %xmm\i 472 sub $4, %r12 473 cmp %r11, %r12 474 jne _get_AAD_loop2\num_initial_blocks\operation 475_get_AAD_loop2_done\num_initial_blocks\operation: 476 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 477 478 xor %r11, %r11 # initialise the data pointer offset as zero 479 480 # start AES for num_initial_blocks blocks 481 482 mov %arg5, %rax # %rax = *Y0 483 movdqu (%rax), \XMM0 # XMM0 = Y0 484 PSHUFB_XMM %xmm14, \XMM0 485 486.if (\i == 5) || (\i == 6) || (\i == 7) 487 488 MOVADQ ONE(%RIP),\TMP1 489 MOVADQ 0(%arg1),\TMP2 490.irpc index, \i_seq 491 paddd \TMP1, \XMM0 # INCR Y0 492 MOVADQ \XMM0, %xmm\index 493 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 494 pxor \TMP2, %xmm\index 495.endr 496 lea 0x10(%arg1),%r10 497 mov keysize,%eax 498 shr $2,%eax # 128->4, 192->6, 256->8 499 add $5,%eax # 128->9, 192->11, 256->13 500 501aes_loop_initial_enc\num_initial_blocks: 502 MOVADQ (%r10),\TMP1 503.irpc index, \i_seq 504 AESENC \TMP1, %xmm\index 505.endr 506 add $16,%r10 507 sub $1,%eax 508 jnz aes_loop_initial_enc\num_initial_blocks 509 510 MOVADQ (%r10), \TMP1 511.irpc index, \i_seq 512 AESENCLAST \TMP1, %xmm\index # Last Round 513.endr 514.irpc index, 
\i_seq 515 movdqu (%arg3 , %r11, 1), \TMP1 516 pxor \TMP1, %xmm\index 517 movdqu %xmm\index, (%arg2 , %r11, 1) 518 # write back plaintext/ciphertext for num_initial_blocks 519 add $16, %r11 520 PSHUFB_XMM %xmm14, %xmm\index 521 522 # prepare plaintext/ciphertext for GHASH computation 523.endr 524.endif 525 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 526 # apply GHASH on num_initial_blocks blocks 527 528.if \i == 5 529 pxor %xmm5, %xmm6 530 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 531 pxor %xmm6, %xmm7 532 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 533 pxor %xmm7, %xmm8 534 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 535.elseif \i == 6 536 pxor %xmm6, %xmm7 537 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 538 pxor %xmm7, %xmm8 539 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 540.elseif \i == 7 541 pxor %xmm7, %xmm8 542 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 543.endif 544 cmp $64, %r13 545 jl _initial_blocks_done\num_initial_blocks\operation 546 # no need for precomputed values 547/* 548* 549* Precomputations for HashKey parallel with encryption of first 4 blocks. 550* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 551*/ 552 MOVADQ ONE(%RIP),\TMP1 553 paddd \TMP1, \XMM0 # INCR Y0 554 MOVADQ \XMM0, \XMM1 555 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 556 557 paddd \TMP1, \XMM0 # INCR Y0 558 MOVADQ \XMM0, \XMM2 559 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 560 561 paddd \TMP1, \XMM0 # INCR Y0 562 MOVADQ \XMM0, \XMM3 563 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 564 565 paddd \TMP1, \XMM0 # INCR Y0 566 MOVADQ \XMM0, \XMM4 567 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 568 569 MOVADQ 0(%arg1),\TMP1 570 pxor \TMP1, \XMM1 571 pxor \TMP1, \XMM2 572 pxor \TMP1, \XMM3 573 pxor \TMP1, \XMM4 574 movdqa \TMP3, \TMP5 575 pshufd $78, \TMP3, \TMP1 576 pxor \TMP3, \TMP1 577 movdqa \TMP1, HashKey_k(%rsp) 578 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 579# TMP5 = HashKey^2<<1 (mod poly) 580 movdqa \TMP5, HashKey_2(%rsp) 581# HashKey_2 = HashKey^2<<1 (mod poly) 582 pshufd $78, \TMP5, \TMP1 583 pxor \TMP5, \TMP1 584 movdqa \TMP1, HashKey_2_k(%rsp) 585.irpc index, 1234 # do 4 rounds 586 movaps 0x10*\index(%arg1), \TMP1 587 AESENC \TMP1, \XMM1 588 AESENC \TMP1, \XMM2 589 AESENC \TMP1, \XMM3 590 AESENC \TMP1, \XMM4 591.endr 592 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 593# TMP5 = HashKey^3<<1 (mod poly) 594 movdqa \TMP5, HashKey_3(%rsp) 595 pshufd $78, \TMP5, \TMP1 596 pxor \TMP5, \TMP1 597 movdqa \TMP1, HashKey_3_k(%rsp) 598.irpc index, 56789 # do next 5 rounds 599 movaps 0x10*\index(%arg1), \TMP1 600 AESENC \TMP1, \XMM1 601 AESENC \TMP1, \XMM2 602 AESENC \TMP1, \XMM3 603 AESENC \TMP1, \XMM4 604.endr 605 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 606# TMP5 = HashKey^3<<1 (mod poly) 607 movdqa \TMP5, HashKey_4(%rsp) 608 pshufd $78, \TMP5, \TMP1 609 pxor \TMP5, \TMP1 610 movdqa \TMP1, HashKey_4_k(%rsp) 611 lea 0xa0(%arg1),%r10 612 mov keysize,%eax 613 shr $2,%eax # 128->4, 192->6, 256->8 614 sub $4,%eax # 128->0, 192->2, 256->4 615 jz aes_loop_pre_enc_done\num_initial_blocks 616 617aes_loop_pre_enc\num_initial_blocks: 618 MOVADQ (%r10),\TMP2 619.irpc index, 1234 620 AESENC \TMP2, %xmm\index 621.endr 622 add $16,%r10 623 sub $1,%eax 624 jnz aes_loop_pre_enc\num_initial_blocks 625 626aes_loop_pre_enc_done\num_initial_blocks: 627 MOVADQ (%r10), \TMP2 628 AESENCLAST \TMP2, \XMM1 629 AESENCLAST \TMP2, \XMM2 630 
AESENCLAST \TMP2, \XMM3 631 AESENCLAST \TMP2, \XMM4 632 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 633 pxor \TMP1, \XMM1 634 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 635 pxor \TMP1, \XMM2 636 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 637 pxor \TMP1, \XMM3 638 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 639 pxor \TMP1, \XMM4 640 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 641 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 642 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 643 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 644 645 add $64, %r11 646 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 647 pxor \XMMDst, \XMM1 648# combine GHASHed value with the corresponding ciphertext 649 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 650 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 651 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 652 653_initial_blocks_done\num_initial_blocks\operation: 654 655.endm 656 657/* 658* encrypt 4 blocks at a time 659* ghash the 4 previously encrypted ciphertext blocks 660* arg1, %arg2, %arg3 are used as pointers only, not modified 661* %r11 is the data offset value 662*/ 663.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ 664TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 665 666 movdqa \XMM1, \XMM5 667 movdqa \XMM2, \XMM6 668 movdqa \XMM3, \XMM7 669 movdqa \XMM4, \XMM8 670 671 movdqa SHUF_MASK(%rip), %xmm15 672 # multiply TMP5 * HashKey using karatsuba 673 674 movdqa \XMM5, \TMP4 675 pshufd $78, \XMM5, \TMP6 676 pxor \XMM5, \TMP6 677 paddd ONE(%rip), \XMM0 # INCR CNT 678 movdqa HashKey_4(%rsp), \TMP5 679 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 680 movdqa \XMM0, \XMM1 681 paddd ONE(%rip), \XMM0 # INCR CNT 682 movdqa \XMM0, \XMM2 683 paddd ONE(%rip), \XMM0 # INCR CNT 684 movdqa \XMM0, \XMM3 685 paddd ONE(%rip), \XMM0 # INCR CNT 686 movdqa \XMM0, \XMM4 687 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 688 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 689 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 690 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 691 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 692 693 pxor (%arg1), \XMM1 694 pxor (%arg1), \XMM2 695 pxor (%arg1), \XMM3 696 pxor (%arg1), \XMM4 697 movdqa HashKey_4_k(%rsp), \TMP5 698 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 699 movaps 0x10(%arg1), \TMP1 700 AESENC \TMP1, \XMM1 # Round 1 701 AESENC \TMP1, \XMM2 702 AESENC \TMP1, \XMM3 703 AESENC \TMP1, \XMM4 704 movaps 0x20(%arg1), \TMP1 705 AESENC \TMP1, \XMM1 # Round 2 706 AESENC \TMP1, \XMM2 707 AESENC \TMP1, \XMM3 708 AESENC \TMP1, \XMM4 709 movdqa \XMM6, \TMP1 710 pshufd $78, \XMM6, \TMP2 711 pxor \XMM6, \TMP2 712 movdqa HashKey_3(%rsp), \TMP5 713 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 714 movaps 0x30(%arg1), \TMP3 715 AESENC \TMP3, \XMM1 # Round 3 716 AESENC \TMP3, \XMM2 717 AESENC \TMP3, \XMM3 718 AESENC \TMP3, \XMM4 719 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 720 movaps 0x40(%arg1), \TMP3 721 AESENC \TMP3, \XMM1 # Round 4 722 AESENC \TMP3, \XMM2 723 AESENC \TMP3, \XMM3 724 AESENC \TMP3, \XMM4 725 movdqa HashKey_3_k(%rsp), \TMP5 726 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 727 movaps 0x50(%arg1), \TMP3 728 AESENC \TMP3, \XMM1 # Round 5 729 AESENC \TMP3, \XMM2 730 AESENC \TMP3, \XMM3 731 AESENC \TMP3, \XMM4 732 pxor \TMP1, \TMP4 733# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 734 pxor \XMM6, \XMM5 735 pxor \TMP2, \TMP6 736 movdqa \XMM7, \TMP1 737 pshufd $78, \XMM7, \TMP2 738 pxor \XMM7, \TMP2 739 movdqa HashKey_2(%rsp ), \TMP5 740 741 # Multiply TMP5 * HashKey using karatsuba 742 743 PCLMULQDQ 0x11, \TMP5, 
\TMP1 # TMP1 = a1*b1 744 movaps 0x60(%arg1), \TMP3 745 AESENC \TMP3, \XMM1 # Round 6 746 AESENC \TMP3, \XMM2 747 AESENC \TMP3, \XMM3 748 AESENC \TMP3, \XMM4 749 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 750 movaps 0x70(%arg1), \TMP3 751 AESENC \TMP3, \XMM1 # Round 7 752 AESENC \TMP3, \XMM2 753 AESENC \TMP3, \XMM3 754 AESENC \TMP3, \XMM4 755 movdqa HashKey_2_k(%rsp), \TMP5 756 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 757 movaps 0x80(%arg1), \TMP3 758 AESENC \TMP3, \XMM1 # Round 8 759 AESENC \TMP3, \XMM2 760 AESENC \TMP3, \XMM3 761 AESENC \TMP3, \XMM4 762 pxor \TMP1, \TMP4 763# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 764 pxor \XMM7, \XMM5 765 pxor \TMP2, \TMP6 766 767 # Multiply XMM8 * HashKey 768 # XMM8 and TMP5 hold the values for the two operands 769 770 movdqa \XMM8, \TMP1 771 pshufd $78, \XMM8, \TMP2 772 pxor \XMM8, \TMP2 773 movdqa HashKey(%rsp), \TMP5 774 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 775 movaps 0x90(%arg1), \TMP3 776 AESENC \TMP3, \XMM1 # Round 9 777 AESENC \TMP3, \XMM2 778 AESENC \TMP3, \XMM3 779 AESENC \TMP3, \XMM4 780 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 781 lea 0xa0(%arg1),%r10 782 mov keysize,%eax 783 shr $2,%eax # 128->4, 192->6, 256->8 784 sub $4,%eax # 128->0, 192->2, 256->4 785 jz aes_loop_par_enc_done 786 787aes_loop_par_enc: 788 MOVADQ (%r10),\TMP3 789.irpc index, 1234 790 AESENC \TMP3, %xmm\index 791.endr 792 add $16,%r10 793 sub $1,%eax 794 jnz aes_loop_par_enc 795 796aes_loop_par_enc_done: 797 MOVADQ (%r10), \TMP3 798 AESENCLAST \TMP3, \XMM1 # Round 10 799 AESENCLAST \TMP3, \XMM2 800 AESENCLAST \TMP3, \XMM3 801 AESENCLAST \TMP3, \XMM4 802 movdqa HashKey_k(%rsp), \TMP5 803 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 804 movdqu (%arg3,%r11,1), \TMP3 805 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 806 movdqu 16(%arg3,%r11,1), \TMP3 807 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 808 movdqu 32(%arg3,%r11,1), \TMP3 809 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 810 movdqu 48(%arg3,%r11,1), \TMP3 811 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 812 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer 813 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer 814 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer 815 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer 816 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 817 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 818 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 819 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 820 821 pxor \TMP4, \TMP1 822 pxor \XMM8, \XMM5 823 pxor \TMP6, \TMP2 824 pxor \TMP1, \TMP2 825 pxor \XMM5, \TMP2 826 movdqa \TMP2, \TMP3 827 pslldq $8, \TMP3 # left shift TMP3 2 DWs 828 psrldq $8, \TMP2 # right shift TMP2 2 DWs 829 pxor \TMP3, \XMM5 830 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 831 832 # first phase of reduction 833 834 movdqa \XMM5, \TMP2 835 movdqa \XMM5, \TMP3 836 movdqa \XMM5, \TMP4 837# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 838 pslld $31, \TMP2 # packed right shift << 31 839 pslld $30, \TMP3 # packed right shift << 30 840 pslld $25, \TMP4 # packed right shift << 25 841 pxor \TMP3, \TMP2 # xor the shifted versions 842 pxor \TMP4, \TMP2 843 movdqa \TMP2, \TMP5 844 psrldq $4, \TMP5 # right shift T5 1 DW 845 pslldq $12, \TMP2 # left shift T2 3 DWs 846 pxor \TMP2, \XMM5 847 848 # second phase of reduction 849 850 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 851 movdqa \XMM5,\TMP3 852 
movdqa \XMM5,\TMP4 853 psrld $1, \TMP2 # packed left shift >>1 854 psrld $2, \TMP3 # packed left shift >>2 855 psrld $7, \TMP4 # packed left shift >>7 856 pxor \TMP3,\TMP2 # xor the shifted versions 857 pxor \TMP4,\TMP2 858 pxor \TMP5, \TMP2 859 pxor \TMP2, \XMM5 860 pxor \TMP1, \XMM5 # result is in TMP1 861 862 pxor \XMM5, \XMM1 863.endm 864 865/* 866* decrypt 4 blocks at a time 867* ghash the 4 previously decrypted ciphertext blocks 868* arg1, %arg2, %arg3 are used as pointers only, not modified 869* %r11 is the data offset value 870*/ 871.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ 872TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 873 874 movdqa \XMM1, \XMM5 875 movdqa \XMM2, \XMM6 876 movdqa \XMM3, \XMM7 877 movdqa \XMM4, \XMM8 878 879 movdqa SHUF_MASK(%rip), %xmm15 880 # multiply TMP5 * HashKey using karatsuba 881 882 movdqa \XMM5, \TMP4 883 pshufd $78, \XMM5, \TMP6 884 pxor \XMM5, \TMP6 885 paddd ONE(%rip), \XMM0 # INCR CNT 886 movdqa HashKey_4(%rsp), \TMP5 887 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 888 movdqa \XMM0, \XMM1 889 paddd ONE(%rip), \XMM0 # INCR CNT 890 movdqa \XMM0, \XMM2 891 paddd ONE(%rip), \XMM0 # INCR CNT 892 movdqa \XMM0, \XMM3 893 paddd ONE(%rip), \XMM0 # INCR CNT 894 movdqa \XMM0, \XMM4 895 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 896 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 897 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 898 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 899 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 900 901 pxor (%arg1), \XMM1 902 pxor (%arg1), \XMM2 903 pxor (%arg1), \XMM3 904 pxor (%arg1), \XMM4 905 movdqa HashKey_4_k(%rsp), \TMP5 906 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 907 movaps 0x10(%arg1), \TMP1 908 AESENC \TMP1, \XMM1 # Round 1 909 AESENC \TMP1, \XMM2 910 AESENC \TMP1, \XMM3 911 AESENC \TMP1, \XMM4 912 movaps 0x20(%arg1), \TMP1 913 AESENC \TMP1, \XMM1 # Round 2 914 AESENC \TMP1, \XMM2 915 AESENC \TMP1, \XMM3 916 AESENC \TMP1, \XMM4 917 movdqa \XMM6, \TMP1 918 pshufd $78, \XMM6, \TMP2 919 pxor \XMM6, \TMP2 920 movdqa HashKey_3(%rsp), \TMP5 921 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 922 movaps 0x30(%arg1), \TMP3 923 AESENC \TMP3, \XMM1 # Round 3 924 AESENC \TMP3, \XMM2 925 AESENC \TMP3, \XMM3 926 AESENC \TMP3, \XMM4 927 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 928 movaps 0x40(%arg1), \TMP3 929 AESENC \TMP3, \XMM1 # Round 4 930 AESENC \TMP3, \XMM2 931 AESENC \TMP3, \XMM3 932 AESENC \TMP3, \XMM4 933 movdqa HashKey_3_k(%rsp), \TMP5 934 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 935 movaps 0x50(%arg1), \TMP3 936 AESENC \TMP3, \XMM1 # Round 5 937 AESENC \TMP3, \XMM2 938 AESENC \TMP3, \XMM3 939 AESENC \TMP3, \XMM4 940 pxor \TMP1, \TMP4 941# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 942 pxor \XMM6, \XMM5 943 pxor \TMP2, \TMP6 944 movdqa \XMM7, \TMP1 945 pshufd $78, \XMM7, \TMP2 946 pxor \XMM7, \TMP2 947 movdqa HashKey_2(%rsp ), \TMP5 948 949 # Multiply TMP5 * HashKey using karatsuba 950 951 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 952 movaps 0x60(%arg1), \TMP3 953 AESENC \TMP3, \XMM1 # Round 6 954 AESENC \TMP3, \XMM2 955 AESENC \TMP3, \XMM3 956 AESENC \TMP3, \XMM4 957 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 958 movaps 0x70(%arg1), \TMP3 959 AESENC \TMP3, \XMM1 # Round 7 960 AESENC \TMP3, \XMM2 961 AESENC \TMP3, \XMM3 962 AESENC \TMP3, \XMM4 963 movdqa HashKey_2_k(%rsp), \TMP5 964 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 965 movaps 0x80(%arg1), \TMP3 966 AESENC \TMP3, \XMM1 # Round 8 967 AESENC \TMP3, 
\XMM2 968 AESENC \TMP3, \XMM3 969 AESENC \TMP3, \XMM4 970 pxor \TMP1, \TMP4 971# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 972 pxor \XMM7, \XMM5 973 pxor \TMP2, \TMP6 974 975 # Multiply XMM8 * HashKey 976 # XMM8 and TMP5 hold the values for the two operands 977 978 movdqa \XMM8, \TMP1 979 pshufd $78, \XMM8, \TMP2 980 pxor \XMM8, \TMP2 981 movdqa HashKey(%rsp), \TMP5 982 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 983 movaps 0x90(%arg1), \TMP3 984 AESENC \TMP3, \XMM1 # Round 9 985 AESENC \TMP3, \XMM2 986 AESENC \TMP3, \XMM3 987 AESENC \TMP3, \XMM4 988 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 989 lea 0xa0(%arg1),%r10 990 mov keysize,%eax 991 shr $2,%eax # 128->4, 192->6, 256->8 992 sub $4,%eax # 128->0, 192->2, 256->4 993 jz aes_loop_par_dec_done 994 995aes_loop_par_dec: 996 MOVADQ (%r10),\TMP3 997.irpc index, 1234 998 AESENC \TMP3, %xmm\index 999.endr 1000 add $16,%r10 1001 sub $1,%eax 1002 jnz aes_loop_par_dec 1003 1004aes_loop_par_dec_done: 1005 MOVADQ (%r10), \TMP3 1006 AESENCLAST \TMP3, \XMM1 # last round 1007 AESENCLAST \TMP3, \XMM2 1008 AESENCLAST \TMP3, \XMM3 1009 AESENCLAST \TMP3, \XMM4 1010 movdqa HashKey_k(%rsp), \TMP5 1011 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1012 movdqu (%arg3,%r11,1), \TMP3 1013 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1014 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer 1015 movdqa \TMP3, \XMM1 1016 movdqu 16(%arg3,%r11,1), \TMP3 1017 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1018 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer 1019 movdqa \TMP3, \XMM2 1020 movdqu 32(%arg3,%r11,1), \TMP3 1021 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1022 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer 1023 movdqa \TMP3, \XMM3 1024 movdqu 48(%arg3,%r11,1), \TMP3 1025 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1026 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer 1027 movdqa \TMP3, \XMM4 1028 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 1029 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 1030 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 1031 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 1032 1033 pxor \TMP4, \TMP1 1034 pxor \XMM8, \XMM5 1035 pxor \TMP6, \TMP2 1036 pxor \TMP1, \TMP2 1037 pxor \XMM5, \TMP2 1038 movdqa \TMP2, \TMP3 1039 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1040 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1041 pxor \TMP3, \XMM5 1042 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1043 1044 # first phase of reduction 1045 1046 movdqa \XMM5, \TMP2 1047 movdqa \XMM5, \TMP3 1048 movdqa \XMM5, \TMP4 1049# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1050 pslld $31, \TMP2 # packed right shift << 31 1051 pslld $30, \TMP3 # packed right shift << 30 1052 pslld $25, \TMP4 # packed right shift << 25 1053 pxor \TMP3, \TMP2 # xor the shifted versions 1054 pxor \TMP4, \TMP2 1055 movdqa \TMP2, \TMP5 1056 psrldq $4, \TMP5 # right shift T5 1 DW 1057 pslldq $12, \TMP2 # left shift T2 3 DWs 1058 pxor \TMP2, \XMM5 1059 1060 # second phase of reduction 1061 1062 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1063 movdqa \XMM5,\TMP3 1064 movdqa \XMM5,\TMP4 1065 psrld $1, \TMP2 # packed left shift >>1 1066 psrld $2, \TMP3 # packed left shift >>2 1067 psrld $7, \TMP4 # packed left shift >>7 1068 pxor \TMP3,\TMP2 # xor the shifted versions 1069 pxor \TMP4,\TMP2 1070 pxor \TMP5, \TMP2 1071 pxor \TMP2, \XMM5 1072 pxor \TMP1, \XMM5 # result is in TMP1 1073 1074 pxor \XMM5, \XMM1 1075.endm 1076 1077/* GHASH the last 
4 ciphertext blocks. */ 1078.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ 1079TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst 1080 1081 # Multiply TMP6 * HashKey (using Karatsuba) 1082 1083 movdqa \XMM1, \TMP6 1084 pshufd $78, \XMM1, \TMP2 1085 pxor \XMM1, \TMP2 1086 movdqa HashKey_4(%rsp), \TMP5 1087 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 1088 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 1089 movdqa HashKey_4_k(%rsp), \TMP4 1090 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1091 movdqa \XMM1, \XMMDst 1092 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 1093 1094 # Multiply TMP1 * HashKey (using Karatsuba) 1095 1096 movdqa \XMM2, \TMP1 1097 pshufd $78, \XMM2, \TMP2 1098 pxor \XMM2, \TMP2 1099 movdqa HashKey_3(%rsp), \TMP5 1100 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1101 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 1102 movdqa HashKey_3_k(%rsp), \TMP4 1103 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1104 pxor \TMP1, \TMP6 1105 pxor \XMM2, \XMMDst 1106 pxor \TMP2, \XMM1 1107# results accumulated in TMP6, XMMDst, XMM1 1108 1109 # Multiply TMP1 * HashKey (using Karatsuba) 1110 1111 movdqa \XMM3, \TMP1 1112 pshufd $78, \XMM3, \TMP2 1113 pxor \XMM3, \TMP2 1114 movdqa HashKey_2(%rsp), \TMP5 1115 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1116 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 1117 movdqa HashKey_2_k(%rsp), \TMP4 1118 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1119 pxor \TMP1, \TMP6 1120 pxor \XMM3, \XMMDst 1121 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 1122 1123 # Multiply TMP1 * HashKey (using Karatsuba) 1124 movdqa \XMM4, \TMP1 1125 pshufd $78, \XMM4, \TMP2 1126 pxor \XMM4, \TMP2 1127 movdqa HashKey(%rsp), \TMP5 1128 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1129 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 1130 movdqa HashKey_k(%rsp), \TMP4 1131 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1132 pxor \TMP1, \TMP6 1133 pxor \XMM4, \XMMDst 1134 pxor \XMM1, \TMP2 1135 pxor \TMP6, \TMP2 1136 pxor \XMMDst, \TMP2 1137 # middle section of the temp results combined as in karatsuba algorithm 1138 movdqa \TMP2, \TMP4 1139 pslldq $8, \TMP4 # left shift TMP4 2 DWs 1140 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1141 pxor \TMP4, \XMMDst 1142 pxor \TMP2, \TMP6 1143# TMP6:XMMDst holds the result of the accumulated carry-less multiplications 1144 # first phase of the reduction 1145 movdqa \XMMDst, \TMP2 1146 movdqa \XMMDst, \TMP3 1147 movdqa \XMMDst, \TMP4 1148# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently 1149 pslld $31, \TMP2 # packed right shifting << 31 1150 pslld $30, \TMP3 # packed right shifting << 30 1151 pslld $25, \TMP4 # packed right shifting << 25 1152 pxor \TMP3, \TMP2 # xor the shifted versions 1153 pxor \TMP4, \TMP2 1154 movdqa \TMP2, \TMP7 1155 psrldq $4, \TMP7 # right shift TMP7 1 DW 1156 pslldq $12, \TMP2 # left shift TMP2 3 DWs 1157 pxor \TMP2, \XMMDst 1158 1159 # second phase of the reduction 1160 movdqa \XMMDst, \TMP2 1161 # make 3 copies of XMMDst for doing 3 shift operations 1162 movdqa \XMMDst, \TMP3 1163 movdqa \XMMDst, \TMP4 1164 psrld $1, \TMP2 # packed left shift >> 1 1165 psrld $2, \TMP3 # packed left shift >> 2 1166 psrld $7, \TMP4 # packed left shift >> 7 1167 pxor \TMP3, \TMP2 # xor the shifted versions 1168 pxor \TMP4, \TMP2 1169 pxor \TMP7, \TMP2 1170 pxor \TMP2, \XMMDst 1171 pxor \TMP6, \XMMDst # reduced result is in XMMDst 1172.endm 1173 1174 1175/* Encryption of a single block 1176* uses eax & r10 1177*/ 1178 1179.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 1180 1181 pxor (%arg1), 
\XMM0 1182 mov keysize,%eax 1183 shr $2,%eax # 128->4, 192->6, 256->8 1184 add $5,%eax # 128->9, 192->11, 256->13 1185 lea 16(%arg1), %r10 # get first expanded key address 1186 1187_esb_loop_\@: 1188 MOVADQ (%r10),\TMP1 1189 AESENC \TMP1,\XMM0 1190 add $16,%r10 1191 sub $1,%eax 1192 jnz _esb_loop_\@ 1193 1194 MOVADQ (%r10),\TMP1 1195 AESENCLAST \TMP1,\XMM0 1196.endm 1197/***************************************************************************** 1198* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1199* u8 *out, // Plaintext output. Encrypt in-place is allowed. 1200* const u8 *in, // Ciphertext input 1201* u64 plaintext_len, // Length of data in bytes for decryption. 1202* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1203* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1204* // concatenated with 0x00000001. 16-byte aligned pointer. 1205* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1206* const u8 *aad, // Additional Authentication Data (AAD) 1207* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1208* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the 1209* // given authentication tag and only return the plaintext if they match. 1210* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 1211* // (most likely), 12 or 8. 1212* 1213* Assumptions: 1214* 1215* keys: 1216* keys are pre-expanded and aligned to 16 bytes. we are using the first 1217* set of 11 keys in the data structure void *aes_ctx 1218* 1219* iv: 1220* 0 1 2 3 1221* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1222* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1223* | Salt (From the SA) | 1224* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1225* | Initialization Vector | 1226* | (This is the sequence number from IPSec header) | 1227* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1228* | 0x1 | 1229* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1230* 1231* 1232* 1233* AAD: 1234* AAD padded to 128 bits with 0 1235* for example, assume AAD is a u32 vector 1236* 1237* if AAD is 8 bytes: 1238* AAD[3] = {A0, A1}; 1239* padded AAD in xmm register = {A1 A0 0 0} 1240* 1241* 0 1 2 3 1242* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1243* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1244* | SPI (A1) | 1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1246* | 32-bit Sequence Number (A0) | 1247* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1248* | 0x0 | 1249* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1250* 1251* AAD Format with 32-bit Sequence Number 1252* 1253* if AAD is 12 bytes: 1254* AAD[3] = {A0, A1, A2}; 1255* padded AAD in xmm register = {A2 A1 A0 0} 1256* 1257* 0 1 2 3 1258* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1259* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1260* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1261* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1262* | SPI (A2) | 1263* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1264* | 64-bit Extended Sequence Number {A1,A0} | 1265* | | 1266* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1267* | 0x0 | 1268* 
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1269* 1270* AAD Format with 64-bit Extended Sequence Number 1271* 1272* aadLen: 1273* from the definition of the spec, aadLen can only be 8 or 12 bytes. 1274* The code supports 16 too but for other sizes, the code will fail. 1275* 1276* TLen: 1277* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 1278* For other sizes, the code will fail. 1279* 1280* poly = x^128 + x^127 + x^126 + x^121 + 1 1281* 1282*****************************************************************************/ 1283ENTRY(aesni_gcm_dec) 1284 push %r12 1285 push %r13 1286 push %r14 1287 mov %rsp, %r14 1288/* 1289* states of %xmm registers %xmm6:%xmm15 not saved 1290* all %xmm registers are clobbered 1291*/ 1292 sub $VARIABLE_OFFSET, %rsp 1293 and $~63, %rsp # align rsp to 64 bytes 1294 mov %arg6, %r12 1295 movdqu (%r12), %xmm13 # %xmm13 = HashKey 1296 movdqa SHUF_MASK(%rip), %xmm2 1297 PSHUFB_XMM %xmm2, %xmm13 1298 1299 1300# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) 1301 1302 movdqa %xmm13, %xmm2 1303 psllq $1, %xmm13 1304 psrlq $63, %xmm2 1305 movdqa %xmm2, %xmm1 1306 pslldq $8, %xmm2 1307 psrldq $8, %xmm1 1308 por %xmm2, %xmm13 1309 1310 # Reduction 1311 1312 pshufd $0x24, %xmm1, %xmm2 1313 pcmpeqd TWOONE(%rip), %xmm2 1314 pand POLY(%rip), %xmm2 1315 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly) 1316 1317 1318 # Decrypt first few blocks 1319 1320 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) 1321 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext 1322 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) 1323 mov %r13, %r12 1324 and $(3<<4), %r12 1325 jz _initial_num_blocks_is_0_decrypt 1326 cmp $(2<<4), %r12 1327 jb _initial_num_blocks_is_1_decrypt 1328 je _initial_num_blocks_is_2_decrypt 1329_initial_num_blocks_is_3_decrypt: 1330 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1331%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec 1332 sub $48, %r13 1333 jmp _initial_blocks_decrypted 1334_initial_num_blocks_is_2_decrypt: 1335 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1336%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec 1337 sub $32, %r13 1338 jmp _initial_blocks_decrypted 1339_initial_num_blocks_is_1_decrypt: 1340 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1341%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec 1342 sub $16, %r13 1343 jmp _initial_blocks_decrypted 1344_initial_num_blocks_is_0_decrypt: 1345 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1346%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec 1347_initial_blocks_decrypted: 1348 cmp $0, %r13 1349 je _zero_cipher_left_decrypt 1350 sub $64, %r13 1351 je _four_cipher_left_decrypt 1352_decrypt_by_4: 1353 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1354%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec 1355 add $64, %r11 1356 sub $64, %r13 1357 jne _decrypt_by_4 1358_four_cipher_left_decrypt: 1359 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 1360%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 1361_zero_cipher_left_decrypt: 1362 mov %arg4, %r13 1363 and $15, %r13 # %r13 = arg4 (mod 16) 1364 je _multiple_of_16_bytes_decrypt 1365 1366 # Handle the last <16 byte block separately 1367 1368 paddd ONE(%rip), %xmm0 # increment CNT to get Yn 1369 movdqa SHUF_MASK(%rip), %xmm10 1370 PSHUFB_XMM %xmm10, %xmm0 1371 1372 
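/*
 * Illustrative only, not part of this file: the <16 byte tail handled below
 * is plain CTR at the byte level. The last counter block Yn is encrypted and
 * only the remaining plaintext_len mod 16 bytes are XORed with it; the
 * assembly additionally shifts and masks the partial block (SHIFT_MASK/ALL_F)
 * so it can be folded into GHASH. A minimal C sketch with made-up names:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	// keystream[] stands for E(K, Yn); rem = plaintext_len % 16 (1..15)
 *	static void gcm_tail_xor(uint8_t *dst, const uint8_t *src,
 *				 const uint8_t keystream[16], size_t rem)
 *	{
 *		size_t i;
 *
 *		for (i = 0; i < rem; i++)
 *			dst[i] = src[i] ^ keystream[i];
 *	}
 */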
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) 1373 sub $16, %r11 1374 add %r13, %r11 1375 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block 1376 lea SHIFT_MASK+16(%rip), %r12 1377 sub %r13, %r12 1378# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes 1379# (%r13 is the number of bytes in plaintext mod 16) 1380 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 1381 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 butes 1382 1383 movdqa %xmm1, %xmm2 1384 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) 1385 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 1386 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 1387 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 1388 pand %xmm1, %xmm2 1389 movdqa SHUF_MASK(%rip), %xmm10 1390 PSHUFB_XMM %xmm10 ,%xmm2 1391 1392 pxor %xmm2, %xmm8 1393 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1394 # GHASH computation for the last <16 byte block 1395 sub %r13, %r11 1396 add $16, %r11 1397 1398 # output %r13 bytes 1399 MOVQ_R64_XMM %xmm0, %rax 1400 cmp $8, %r13 1401 jle _less_than_8_bytes_left_decrypt 1402 mov %rax, (%arg2 , %r11, 1) 1403 add $8, %r11 1404 psrldq $8, %xmm0 1405 MOVQ_R64_XMM %xmm0, %rax 1406 sub $8, %r13 1407_less_than_8_bytes_left_decrypt: 1408 mov %al, (%arg2, %r11, 1) 1409 add $1, %r11 1410 shr $8, %rax 1411 sub $1, %r13 1412 jne _less_than_8_bytes_left_decrypt 1413_multiple_of_16_bytes_decrypt: 1414 mov arg8, %r12 # %r13 = aadLen (number of bytes) 1415 shl $3, %r12 # convert into number of bits 1416 movd %r12d, %xmm15 # len(A) in %xmm15 1417 shl $3, %arg4 # len(C) in bits (*128) 1418 MOVQ_R64_XMM %arg4, %xmm1 1419 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1420 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1421 pxor %xmm15, %xmm8 1422 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1423 # final GHASH computation 1424 movdqa SHUF_MASK(%rip), %xmm10 1425 PSHUFB_XMM %xmm10, %xmm8 1426 1427 mov %arg5, %rax # %rax = *Y0 1428 movdqu (%rax), %xmm0 # %xmm0 = Y0 1429 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) 1430 pxor %xmm8, %xmm0 1431_return_T_decrypt: 1432 mov arg9, %r10 # %r10 = authTag 1433 mov arg10, %r11 # %r11 = auth_tag_len 1434 cmp $16, %r11 1435 je _T_16_decrypt 1436 cmp $12, %r11 1437 je _T_12_decrypt 1438_T_8_decrypt: 1439 MOVQ_R64_XMM %xmm0, %rax 1440 mov %rax, (%r10) 1441 jmp _return_T_done_decrypt 1442_T_12_decrypt: 1443 MOVQ_R64_XMM %xmm0, %rax 1444 mov %rax, (%r10) 1445 psrldq $8, %xmm0 1446 movd %xmm0, %eax 1447 mov %eax, 8(%r10) 1448 jmp _return_T_done_decrypt 1449_T_16_decrypt: 1450 movdqu %xmm0, (%r10) 1451_return_T_done_decrypt: 1452 mov %r14, %rsp 1453 pop %r14 1454 pop %r13 1455 pop %r12 1456 ret 1457ENDPROC(aesni_gcm_dec) 1458 1459 1460/***************************************************************************** 1461* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1462* u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1463* const u8 *in, // Plaintext input 1464* u64 plaintext_len, // Length of data in bytes for encryption. 1465* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1466* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1467* // concatenated with 0x00000001. 16-byte aligned pointer. 1468* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1469* const u8 *aad, // Additional Authentication Data (AAD) 1470* u64 aad_len, // Length of AAD in bytes. 
With RFC4106 this is going to be 8 or 12 bytes 1471* u8 *auth_tag, // Authenticated Tag output. 1472* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 1473* // 12 or 8. 1474* 1475* Assumptions: 1476* 1477* keys: 1478* keys are pre-expanded and aligned to 16 bytes. we are using the 1479* first set of 11 keys in the data structure void *aes_ctx 1480* 1481* 1482* iv: 1483* 0 1 2 3 1484* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1485* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1486* | Salt (From the SA) | 1487* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1488* | Initialization Vector | 1489* | (This is the sequence number from IPSec header) | 1490* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1491* | 0x1 | 1492* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1493* 1494* 1495* 1496* AAD: 1497* AAD padded to 128 bits with 0 1498* for example, assume AAD is a u32 vector 1499* 1500* if AAD is 8 bytes: 1501* AAD[3] = {A0, A1}; 1502* padded AAD in xmm register = {A1 A0 0 0} 1503* 1504* 0 1 2 3 1505* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1507* | SPI (A1) | 1508* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1509* | 32-bit Sequence Number (A0) | 1510* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1511* | 0x0 | 1512* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1513* 1514* AAD Format with 32-bit Sequence Number 1515* 1516* if AAD is 12 bytes: 1517* AAD[3] = {A0, A1, A2}; 1518* padded AAD in xmm register = {A2 A1 A0 0} 1519* 1520* 0 1 2 3 1521* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1522* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1523* | SPI (A2) | 1524* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1525* | 64-bit Extended Sequence Number {A1,A0} | 1526* | | 1527* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1528* | 0x0 | 1529* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1530* 1531* AAD Format with 64-bit Extended Sequence Number 1532* 1533* aadLen: 1534* from the definition of the spec, aadLen can only be 8 or 12 bytes. 1535* The code supports 16 too but for other sizes, the code will fail. 1536* 1537* TLen: 1538* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 1539* For other sizes, the code will fail. 
1540* 1541* poly = x^128 + x^127 + x^126 + x^121 + 1 1542***************************************************************************/ 1543ENTRY(aesni_gcm_enc) 1544 push %r12 1545 push %r13 1546 push %r14 1547 mov %rsp, %r14 1548# 1549# states of %xmm registers %xmm6:%xmm15 not saved 1550# all %xmm registers are clobbered 1551# 1552 sub $VARIABLE_OFFSET, %rsp 1553 and $~63, %rsp 1554 mov %arg6, %r12 1555 movdqu (%r12), %xmm13 1556 movdqa SHUF_MASK(%rip), %xmm2 1557 PSHUFB_XMM %xmm2, %xmm13 1558 1559 1560# precompute HashKey<<1 mod poly from the HashKey (required for GHASH) 1561 1562 movdqa %xmm13, %xmm2 1563 psllq $1, %xmm13 1564 psrlq $63, %xmm2 1565 movdqa %xmm2, %xmm1 1566 pslldq $8, %xmm2 1567 psrldq $8, %xmm1 1568 por %xmm2, %xmm13 1569 1570 # reduce HashKey<<1 1571 1572 pshufd $0x24, %xmm1, %xmm2 1573 pcmpeqd TWOONE(%rip), %xmm2 1574 pand POLY(%rip), %xmm2 1575 pxor %xmm2, %xmm13 1576 movdqa %xmm13, HashKey(%rsp) 1577 mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly) 1578 and $-16, %r13 1579 mov %r13, %r12 1580 1581 # Encrypt first few blocks 1582 1583 and $(3<<4), %r12 1584 jz _initial_num_blocks_is_0_encrypt 1585 cmp $(2<<4), %r12 1586 jb _initial_num_blocks_is_1_encrypt 1587 je _initial_num_blocks_is_2_encrypt 1588_initial_num_blocks_is_3_encrypt: 1589 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1590%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc 1591 sub $48, %r13 1592 jmp _initial_blocks_encrypted 1593_initial_num_blocks_is_2_encrypt: 1594 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1595%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc 1596 sub $32, %r13 1597 jmp _initial_blocks_encrypted 1598_initial_num_blocks_is_1_encrypt: 1599 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1600%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc 1601 sub $16, %r13 1602 jmp _initial_blocks_encrypted 1603_initial_num_blocks_is_0_encrypt: 1604 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 1605%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc 1606_initial_blocks_encrypted: 1607 1608 # Main loop - Encrypt remaining blocks 1609 1610 cmp $0, %r13 1611 je _zero_cipher_left_encrypt 1612 sub $64, %r13 1613 je _four_cipher_left_encrypt 1614_encrypt_by_4_encrypt: 1615 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ 1616%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc 1617 add $64, %r11 1618 sub $64, %r13 1619 jne _encrypt_by_4_encrypt 1620_four_cipher_left_encrypt: 1621 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 1622%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 1623_zero_cipher_left_encrypt: 1624 mov %arg4, %r13 1625 and $15, %r13 # %r13 = arg4 (mod 16) 1626 je _multiple_of_16_bytes_encrypt 1627 1628 # Handle the last <16 Byte block separately 1629 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 1630 movdqa SHUF_MASK(%rip), %xmm10 1631 PSHUFB_XMM %xmm10, %xmm0 1632 1633 1634 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) 1635 sub $16, %r11 1636 add %r13, %r11 1637 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks 1638 lea SHIFT_MASK+16(%rip), %r12 1639 sub %r13, %r12 1640 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 1641 # (%r13 is the number of bytes in plaintext mod 16) 1642 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 1643 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte 1644 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) 1645 movdqu 
ALL_F-SHIFT_MASK(%r12), %xmm1 1646 # get the appropriate mask to mask out top 16-r13 bytes of xmm0 1647 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 1648 movdqa SHUF_MASK(%rip), %xmm10 1649 PSHUFB_XMM %xmm10,%xmm0 1650 1651 pxor %xmm0, %xmm8 1652 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1653 # GHASH computation for the last <16 byte block 1654 sub %r13, %r11 1655 add $16, %r11 1656 1657 movdqa SHUF_MASK(%rip), %xmm10 1658 PSHUFB_XMM %xmm10, %xmm0 1659 1660 # shuffle xmm0 back to output as ciphertext 1661 1662 # Output %r13 bytes 1663 MOVQ_R64_XMM %xmm0, %rax 1664 cmp $8, %r13 1665 jle _less_than_8_bytes_left_encrypt 1666 mov %rax, (%arg2 , %r11, 1) 1667 add $8, %r11 1668 psrldq $8, %xmm0 1669 MOVQ_R64_XMM %xmm0, %rax 1670 sub $8, %r13 1671_less_than_8_bytes_left_encrypt: 1672 mov %al, (%arg2, %r11, 1) 1673 add $1, %r11 1674 shr $8, %rax 1675 sub $1, %r13 1676 jne _less_than_8_bytes_left_encrypt 1677_multiple_of_16_bytes_encrypt: 1678 mov arg8, %r12 # %r12 = addLen (number of bytes) 1679 shl $3, %r12 1680 movd %r12d, %xmm15 # len(A) in %xmm15 1681 shl $3, %arg4 # len(C) in bits (*128) 1682 MOVQ_R64_XMM %arg4, %xmm1 1683 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1684 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1685 pxor %xmm15, %xmm8 1686 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1687 # final GHASH computation 1688 movdqa SHUF_MASK(%rip), %xmm10 1689 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap 1690 1691 mov %arg5, %rax # %rax = *Y0 1692 movdqu (%rax), %xmm0 # %xmm0 = Y0 1693 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) 1694 pxor %xmm8, %xmm0 1695_return_T_encrypt: 1696 mov arg9, %r10 # %r10 = authTag 1697 mov arg10, %r11 # %r11 = auth_tag_len 1698 cmp $16, %r11 1699 je _T_16_encrypt 1700 cmp $12, %r11 1701 je _T_12_encrypt 1702_T_8_encrypt: 1703 MOVQ_R64_XMM %xmm0, %rax 1704 mov %rax, (%r10) 1705 jmp _return_T_done_encrypt 1706_T_12_encrypt: 1707 MOVQ_R64_XMM %xmm0, %rax 1708 mov %rax, (%r10) 1709 psrldq $8, %xmm0 1710 movd %xmm0, %eax 1711 mov %eax, 8(%r10) 1712 jmp _return_T_done_encrypt 1713_T_16_encrypt: 1714 movdqu %xmm0, (%r10) 1715_return_T_done_encrypt: 1716 mov %r14, %rsp 1717 pop %r14 1718 pop %r13 1719 pop %r12 1720 ret 1721ENDPROC(aesni_gcm_enc) 1722 1723#endif 1724 1725 1726.align 4 1727_key_expansion_128: 1728_key_expansion_256a: 1729 pshufd $0b11111111, %xmm1, %xmm1 1730 shufps $0b00010000, %xmm0, %xmm4 1731 pxor %xmm4, %xmm0 1732 shufps $0b10001100, %xmm0, %xmm4 1733 pxor %xmm4, %xmm0 1734 pxor %xmm1, %xmm0 1735 movaps %xmm0, (TKEYP) 1736 add $0x10, TKEYP 1737 ret 1738ENDPROC(_key_expansion_128) 1739ENDPROC(_key_expansion_256a) 1740 1741.align 4 1742_key_expansion_192a: 1743 pshufd $0b01010101, %xmm1, %xmm1 1744 shufps $0b00010000, %xmm0, %xmm4 1745 pxor %xmm4, %xmm0 1746 shufps $0b10001100, %xmm0, %xmm4 1747 pxor %xmm4, %xmm0 1748 pxor %xmm1, %xmm0 1749 1750 movaps %xmm2, %xmm5 1751 movaps %xmm2, %xmm6 1752 pslldq $4, %xmm5 1753 pshufd $0b11111111, %xmm0, %xmm3 1754 pxor %xmm3, %xmm2 1755 pxor %xmm5, %xmm2 1756 1757 movaps %xmm0, %xmm1 1758 shufps $0b01000100, %xmm0, %xmm6 1759 movaps %xmm6, (TKEYP) 1760 shufps $0b01001110, %xmm2, %xmm1 1761 movaps %xmm1, 0x10(TKEYP) 1762 add $0x20, TKEYP 1763 ret 1764ENDPROC(_key_expansion_192a) 1765 1766.align 4 1767_key_expansion_192b: 1768 pshufd $0b01010101, %xmm1, %xmm1 1769 shufps $0b00010000, %xmm0, %xmm4 1770 pxor %xmm4, %xmm0 1771 shufps $0b10001100, %xmm0, %xmm4 1772 pxor %xmm4, %xmm0 1773 pxor %xmm1, %xmm0 1774 1775 movaps %xmm2, %xmm5 1776 pslldq $4, 
%xmm5 1777 pshufd $0b11111111, %xmm0, %xmm3 1778 pxor %xmm3, %xmm2 1779 pxor %xmm5, %xmm2 1780 1781 movaps %xmm0, (TKEYP) 1782 add $0x10, TKEYP 1783 ret 1784ENDPROC(_key_expansion_192b) 1785 1786.align 4 1787_key_expansion_256b: 1788 pshufd $0b10101010, %xmm1, %xmm1 1789 shufps $0b00010000, %xmm2, %xmm4 1790 pxor %xmm4, %xmm2 1791 shufps $0b10001100, %xmm2, %xmm4 1792 pxor %xmm4, %xmm2 1793 pxor %xmm1, %xmm2 1794 movaps %xmm2, (TKEYP) 1795 add $0x10, TKEYP 1796 ret 1797ENDPROC(_key_expansion_256b) 1798 1799/* 1800 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 1801 * unsigned int key_len) 1802 */ 1803ENTRY(aesni_set_key) 1804 FRAME_BEGIN 1805#ifndef __x86_64__ 1806 pushl KEYP 1807 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx 1808 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key 1809 movl (FRAME_OFFSET+16)(%esp), %edx # key_len 1810#endif 1811 movups (UKEYP), %xmm0 # user key (first 16 bytes) 1812 movaps %xmm0, (KEYP) 1813 lea 0x10(KEYP), TKEYP # key addr 1814 movl %edx, 480(KEYP) 1815 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1816 cmp $24, %dl 1817 jb .Lenc_key128 1818 je .Lenc_key192 1819 movups 0x10(UKEYP), %xmm2 # other user key 1820 movaps %xmm2, (TKEYP) 1821 add $0x10, TKEYP 1822 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1823 call _key_expansion_256a 1824 AESKEYGENASSIST 0x1 %xmm0 %xmm1 1825 call _key_expansion_256b 1826 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1827 call _key_expansion_256a 1828 AESKEYGENASSIST 0x2 %xmm0 %xmm1 1829 call _key_expansion_256b 1830 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 1831 call _key_expansion_256a 1832 AESKEYGENASSIST 0x4 %xmm0 %xmm1 1833 call _key_expansion_256b 1834 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 1835 call _key_expansion_256a 1836 AESKEYGENASSIST 0x8 %xmm0 %xmm1 1837 call _key_expansion_256b 1838 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 1839 call _key_expansion_256a 1840 AESKEYGENASSIST 0x10 %xmm0 %xmm1 1841 call _key_expansion_256b 1842 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 1843 call _key_expansion_256a 1844 AESKEYGENASSIST 0x20 %xmm0 %xmm1 1845 call _key_expansion_256b 1846 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 1847 call _key_expansion_256a 1848 jmp .Ldec_key 1849.Lenc_key192: 1850 movq 0x10(UKEYP), %xmm2 # other user key 1851 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1852 call _key_expansion_192a 1853 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1854 call _key_expansion_192b 1855 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 1856 call _key_expansion_192a 1857 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 1858 call _key_expansion_192b 1859 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 1860 call _key_expansion_192a 1861 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 1862 call _key_expansion_192b 1863 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 1864 call _key_expansion_192a 1865 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8 1866 call _key_expansion_192b 1867 jmp .Ldec_key 1868.Lenc_key128: 1869 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1 1870 call _key_expansion_128 1871 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2 1872 call _key_expansion_128 1873 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3 1874 call _key_expansion_128 1875 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4 1876 call _key_expansion_128 1877 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5 1878 call _key_expansion_128 1879 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6 1880 call _key_expansion_128 1881 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7 1882 call _key_expansion_128 1883 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8 1884 call _key_expansion_128 1885 AESKEYGENASSIST 0x1b %xmm0 
%xmm1		# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_set_key)

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_enc)

/*
 * _aesni_enc1:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE
	ret
ENDPROC(_aesni_enc1)
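
/*
 * For reference only (not part of the build): a minimal C sketch, using the
 * AES-NI intrinsics, of the round structure that _aesni_enc1 implements
 * above -- one whitening XOR, then 9/11/13 AESENC rounds depending on the
 * key length, then AESENCLAST.  The function and parameter names are made up
 * for this example; the expanded round keys are assumed to be stored
 * consecutively, as aesni_set_key lays them out.
 *
 *	#include <stdint.h>
 *	#include <wmmintrin.h>	// AES-NI intrinsics; compile with -maes
 *
 *	static __m128i aes_encrypt_block(const __m128i *rk,
 *					 unsigned int key_len, __m128i block)
 *	{
 *		unsigned int rounds = key_len / 4 + 6;	// 10, 12 or 14
 *		unsigned int i;
 *
 *		block = _mm_xor_si128(block, rk[0]);	// round 0: whitening
 *		for (i = 1; i < rounds; i++)
 *			block = _mm_aesenc_si128(block, rk[i]);
 *		return _mm_aesenclast_si128(block, rk[rounds]);
 *	}
 */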

/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)

/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_dec)

/*
 * _aesni_dec1:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE
	ret
ENDPROC(_aesni_dec1)

/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final
state (output) 2199 * STATE2 2200 * STATE3 2201 * STATE4 2202 * changed: 2203 * KEY 2204 * TKEYP (T1) 2205 */ 2206.align 4 2207_aesni_dec4: 2208 movaps (KEYP), KEY # key 2209 mov KEYP, TKEYP 2210 pxor KEY, STATE1 # round 0 2211 pxor KEY, STATE2 2212 pxor KEY, STATE3 2213 pxor KEY, STATE4 2214 add $0x30, TKEYP 2215 cmp $24, KLEN 2216 jb .L4dec128 2217 lea 0x20(TKEYP), TKEYP 2218 je .L4dec192 2219 add $0x20, TKEYP 2220 movaps -0x60(TKEYP), KEY 2221 AESDEC KEY STATE1 2222 AESDEC KEY STATE2 2223 AESDEC KEY STATE3 2224 AESDEC KEY STATE4 2225 movaps -0x50(TKEYP), KEY 2226 AESDEC KEY STATE1 2227 AESDEC KEY STATE2 2228 AESDEC KEY STATE3 2229 AESDEC KEY STATE4 2230.align 4 2231.L4dec192: 2232 movaps -0x40(TKEYP), KEY 2233 AESDEC KEY STATE1 2234 AESDEC KEY STATE2 2235 AESDEC KEY STATE3 2236 AESDEC KEY STATE4 2237 movaps -0x30(TKEYP), KEY 2238 AESDEC KEY STATE1 2239 AESDEC KEY STATE2 2240 AESDEC KEY STATE3 2241 AESDEC KEY STATE4 2242.align 4 2243.L4dec128: 2244 movaps -0x20(TKEYP), KEY 2245 AESDEC KEY STATE1 2246 AESDEC KEY STATE2 2247 AESDEC KEY STATE3 2248 AESDEC KEY STATE4 2249 movaps -0x10(TKEYP), KEY 2250 AESDEC KEY STATE1 2251 AESDEC KEY STATE2 2252 AESDEC KEY STATE3 2253 AESDEC KEY STATE4 2254 movaps (TKEYP), KEY 2255 AESDEC KEY STATE1 2256 AESDEC KEY STATE2 2257 AESDEC KEY STATE3 2258 AESDEC KEY STATE4 2259 movaps 0x10(TKEYP), KEY 2260 AESDEC KEY STATE1 2261 AESDEC KEY STATE2 2262 AESDEC KEY STATE3 2263 AESDEC KEY STATE4 2264 movaps 0x20(TKEYP), KEY 2265 AESDEC KEY STATE1 2266 AESDEC KEY STATE2 2267 AESDEC KEY STATE3 2268 AESDEC KEY STATE4 2269 movaps 0x30(TKEYP), KEY 2270 AESDEC KEY STATE1 2271 AESDEC KEY STATE2 2272 AESDEC KEY STATE3 2273 AESDEC KEY STATE4 2274 movaps 0x40(TKEYP), KEY 2275 AESDEC KEY STATE1 2276 AESDEC KEY STATE2 2277 AESDEC KEY STATE3 2278 AESDEC KEY STATE4 2279 movaps 0x50(TKEYP), KEY 2280 AESDEC KEY STATE1 2281 AESDEC KEY STATE2 2282 AESDEC KEY STATE3 2283 AESDEC KEY STATE4 2284 movaps 0x60(TKEYP), KEY 2285 AESDEC KEY STATE1 2286 AESDEC KEY STATE2 2287 AESDEC KEY STATE3 2288 AESDEC KEY STATE4 2289 movaps 0x70(TKEYP), KEY 2290 AESDECLAST KEY STATE1 # last round 2291 AESDECLAST KEY STATE2 2292 AESDECLAST KEY STATE3 2293 AESDECLAST KEY STATE4 2294 ret 2295ENDPROC(_aesni_dec4) 2296 2297/* 2298 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2299 * size_t len) 2300 */ 2301ENTRY(aesni_ecb_enc) 2302 FRAME_BEGIN 2303#ifndef __x86_64__ 2304 pushl LEN 2305 pushl KEYP 2306 pushl KLEN 2307 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2308 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2309 movl (FRAME_OFFSET+24)(%esp), INP # src 2310 movl (FRAME_OFFSET+28)(%esp), LEN # len 2311#endif 2312 test LEN, LEN # check length 2313 jz .Lecb_enc_ret 2314 mov 480(KEYP), KLEN 2315 cmp $16, LEN 2316 jb .Lecb_enc_ret 2317 cmp $64, LEN 2318 jb .Lecb_enc_loop1 2319.align 4 2320.Lecb_enc_loop4: 2321 movups (INP), STATE1 2322 movups 0x10(INP), STATE2 2323 movups 0x20(INP), STATE3 2324 movups 0x30(INP), STATE4 2325 call _aesni_enc4 2326 movups STATE1, (OUTP) 2327 movups STATE2, 0x10(OUTP) 2328 movups STATE3, 0x20(OUTP) 2329 movups STATE4, 0x30(OUTP) 2330 sub $64, LEN 2331 add $64, INP 2332 add $64, OUTP 2333 cmp $64, LEN 2334 jge .Lecb_enc_loop4 2335 cmp $16, LEN 2336 jb .Lecb_enc_ret 2337.align 4 2338.Lecb_enc_loop1: 2339 movups (INP), STATE1 2340 call _aesni_enc1 2341 movups STATE1, (OUTP) 2342 sub $16, LEN 2343 add $16, INP 2344 add $16, OUTP 2345 cmp $16, LEN 2346 jge .Lecb_enc_loop1 2347.Lecb_enc_ret: 2348#ifndef __x86_64__ 2349 popl KLEN 2350 popl KEYP 2351 popl LEN 2352#endif 2353 
FRAME_END 2354 ret 2355ENDPROC(aesni_ecb_enc) 2356 2357/* 2358 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2359 * size_t len); 2360 */ 2361ENTRY(aesni_ecb_dec) 2362 FRAME_BEGIN 2363#ifndef __x86_64__ 2364 pushl LEN 2365 pushl KEYP 2366 pushl KLEN 2367 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2368 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2369 movl (FRAME_OFFSET+24)(%esp), INP # src 2370 movl (FRAME_OFFSET+28)(%esp), LEN # len 2371#endif 2372 test LEN, LEN 2373 jz .Lecb_dec_ret 2374 mov 480(KEYP), KLEN 2375 add $240, KEYP 2376 cmp $16, LEN 2377 jb .Lecb_dec_ret 2378 cmp $64, LEN 2379 jb .Lecb_dec_loop1 2380.align 4 2381.Lecb_dec_loop4: 2382 movups (INP), STATE1 2383 movups 0x10(INP), STATE2 2384 movups 0x20(INP), STATE3 2385 movups 0x30(INP), STATE4 2386 call _aesni_dec4 2387 movups STATE1, (OUTP) 2388 movups STATE2, 0x10(OUTP) 2389 movups STATE3, 0x20(OUTP) 2390 movups STATE4, 0x30(OUTP) 2391 sub $64, LEN 2392 add $64, INP 2393 add $64, OUTP 2394 cmp $64, LEN 2395 jge .Lecb_dec_loop4 2396 cmp $16, LEN 2397 jb .Lecb_dec_ret 2398.align 4 2399.Lecb_dec_loop1: 2400 movups (INP), STATE1 2401 call _aesni_dec1 2402 movups STATE1, (OUTP) 2403 sub $16, LEN 2404 add $16, INP 2405 add $16, OUTP 2406 cmp $16, LEN 2407 jge .Lecb_dec_loop1 2408.Lecb_dec_ret: 2409#ifndef __x86_64__ 2410 popl KLEN 2411 popl KEYP 2412 popl LEN 2413#endif 2414 FRAME_END 2415 ret 2416ENDPROC(aesni_ecb_dec) 2417 2418/* 2419 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2420 * size_t len, u8 *iv) 2421 */ 2422ENTRY(aesni_cbc_enc) 2423 FRAME_BEGIN 2424#ifndef __x86_64__ 2425 pushl IVP 2426 pushl LEN 2427 pushl KEYP 2428 pushl KLEN 2429 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2430 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2431 movl (FRAME_OFFSET+28)(%esp), INP # src 2432 movl (FRAME_OFFSET+32)(%esp), LEN # len 2433 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2434#endif 2435 cmp $16, LEN 2436 jb .Lcbc_enc_ret 2437 mov 480(KEYP), KLEN 2438 movups (IVP), STATE # load iv as initial state 2439.align 4 2440.Lcbc_enc_loop: 2441 movups (INP), IN # load input 2442 pxor IN, STATE 2443 call _aesni_enc1 2444 movups STATE, (OUTP) # store output 2445 sub $16, LEN 2446 add $16, INP 2447 add $16, OUTP 2448 cmp $16, LEN 2449 jge .Lcbc_enc_loop 2450 movups STATE, (IVP) 2451.Lcbc_enc_ret: 2452#ifndef __x86_64__ 2453 popl KLEN 2454 popl KEYP 2455 popl LEN 2456 popl IVP 2457#endif 2458 FRAME_END 2459 ret 2460ENDPROC(aesni_cbc_enc) 2461 2462/* 2463 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2464 * size_t len, u8 *iv) 2465 */ 2466ENTRY(aesni_cbc_dec) 2467 FRAME_BEGIN 2468#ifndef __x86_64__ 2469 pushl IVP 2470 pushl LEN 2471 pushl KEYP 2472 pushl KLEN 2473 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2474 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2475 movl (FRAME_OFFSET+28)(%esp), INP # src 2476 movl (FRAME_OFFSET+32)(%esp), LEN # len 2477 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2478#endif 2479 cmp $16, LEN 2480 jb .Lcbc_dec_just_ret 2481 mov 480(KEYP), KLEN 2482 add $240, KEYP 2483 movups (IVP), IV 2484 cmp $64, LEN 2485 jb .Lcbc_dec_loop1 2486.align 4 2487.Lcbc_dec_loop4: 2488 movups (INP), IN1 2489 movaps IN1, STATE1 2490 movups 0x10(INP), IN2 2491 movaps IN2, STATE2 2492#ifdef __x86_64__ 2493 movups 0x20(INP), IN3 2494 movaps IN3, STATE3 2495 movups 0x30(INP), IN4 2496 movaps IN4, STATE4 2497#else 2498 movups 0x20(INP), IN1 2499 movaps IN1, STATE3 2500 movups 0x30(INP), IN2 2501 movaps IN2, STATE4 2502#endif 2503 call _aesni_dec4 2504 pxor IV, STATE1 2505#ifdef __x86_64__ 2506 
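	# 64-bit path: IN1..IN4 still hold the four ciphertext blocks loaded
	# above.  Chain them here: each decrypted block is XORed with the
	# preceding ciphertext block (the first was already XORed with IV),
	# and the last ciphertext block, IN4, becomes the IV for the next
	# iteration.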
pxor IN1, STATE2 2507 pxor IN2, STATE3 2508 pxor IN3, STATE4 2509 movaps IN4, IV 2510#else 2511 pxor IN1, STATE4 2512 movaps IN2, IV 2513 movups (INP), IN1 2514 pxor IN1, STATE2 2515 movups 0x10(INP), IN2 2516 pxor IN2, STATE3 2517#endif 2518 movups STATE1, (OUTP) 2519 movups STATE2, 0x10(OUTP) 2520 movups STATE3, 0x20(OUTP) 2521 movups STATE4, 0x30(OUTP) 2522 sub $64, LEN 2523 add $64, INP 2524 add $64, OUTP 2525 cmp $64, LEN 2526 jge .Lcbc_dec_loop4 2527 cmp $16, LEN 2528 jb .Lcbc_dec_ret 2529.align 4 2530.Lcbc_dec_loop1: 2531 movups (INP), IN 2532 movaps IN, STATE 2533 call _aesni_dec1 2534 pxor IV, STATE 2535 movups STATE, (OUTP) 2536 movaps IN, IV 2537 sub $16, LEN 2538 add $16, INP 2539 add $16, OUTP 2540 cmp $16, LEN 2541 jge .Lcbc_dec_loop1 2542.Lcbc_dec_ret: 2543 movups IV, (IVP) 2544.Lcbc_dec_just_ret: 2545#ifndef __x86_64__ 2546 popl KLEN 2547 popl KEYP 2548 popl LEN 2549 popl IVP 2550#endif 2551 FRAME_END 2552 ret 2553ENDPROC(aesni_cbc_dec) 2554 2555#ifdef __x86_64__ 2556.pushsection .rodata 2557.align 16 2558.Lbswap_mask: 2559 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2560.popsection 2561 2562/* 2563 * _aesni_inc_init: internal ABI 2564 * setup registers used by _aesni_inc 2565 * input: 2566 * IV 2567 * output: 2568 * CTR: == IV, in little endian 2569 * TCTR_LOW: == lower qword of CTR 2570 * INC: == 1, in little endian 2571 * BSWAP_MASK == endian swapping mask 2572 */ 2573.align 4 2574_aesni_inc_init: 2575 movaps .Lbswap_mask, BSWAP_MASK 2576 movaps IV, CTR 2577 PSHUFB_XMM BSWAP_MASK CTR 2578 mov $1, TCTR_LOW 2579 MOVQ_R64_XMM TCTR_LOW INC 2580 MOVQ_R64_XMM CTR TCTR_LOW 2581 ret 2582ENDPROC(_aesni_inc_init) 2583 2584/* 2585 * _aesni_inc: internal ABI 2586 * Increase IV by 1, IV is in big endian 2587 * input: 2588 * IV 2589 * CTR: == IV, in little endian 2590 * TCTR_LOW: == lower qword of CTR 2591 * INC: == 1, in little endian 2592 * BSWAP_MASK == endian swapping mask 2593 * output: 2594 * IV: Increase by 1 2595 * changed: 2596 * CTR: == output IV, in little endian 2597 * TCTR_LOW: == lower qword of CTR 2598 */ 2599.align 4 2600_aesni_inc: 2601 paddq INC, CTR 2602 add $1, TCTR_LOW 2603 jnc .Linc_low 2604 pslldq $8, INC 2605 paddq INC, CTR 2606 psrldq $8, INC 2607.Linc_low: 2608 movaps CTR, IV 2609 PSHUFB_XMM BSWAP_MASK IV 2610 ret 2611ENDPROC(_aesni_inc) 2612 2613/* 2614 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2615 * size_t len, u8 *iv) 2616 */ 2617ENTRY(aesni_ctr_enc) 2618 FRAME_BEGIN 2619 cmp $16, LEN 2620 jb .Lctr_enc_just_ret 2621 mov 480(KEYP), KLEN 2622 movups (IVP), IV 2623 call _aesni_inc_init 2624 cmp $64, LEN 2625 jb .Lctr_enc_loop1 2626.align 4 2627.Lctr_enc_loop4: 2628 movaps IV, STATE1 2629 call _aesni_inc 2630 movups (INP), IN1 2631 movaps IV, STATE2 2632 call _aesni_inc 2633 movups 0x10(INP), IN2 2634 movaps IV, STATE3 2635 call _aesni_inc 2636 movups 0x20(INP), IN3 2637 movaps IV, STATE4 2638 call _aesni_inc 2639 movups 0x30(INP), IN4 2640 call _aesni_enc4 2641 pxor IN1, STATE1 2642 movups STATE1, (OUTP) 2643 pxor IN2, STATE2 2644 movups STATE2, 0x10(OUTP) 2645 pxor IN3, STATE3 2646 movups STATE3, 0x20(OUTP) 2647 pxor IN4, STATE4 2648 movups STATE4, 0x30(OUTP) 2649 sub $64, LEN 2650 add $64, INP 2651 add $64, OUTP 2652 cmp $64, LEN 2653 jge .Lctr_enc_loop4 2654 cmp $16, LEN 2655 jb .Lctr_enc_ret 2656.align 4 2657.Lctr_enc_loop1: 2658 movaps IV, STATE 2659 call _aesni_inc 2660 movups (INP), IN 2661 call _aesni_enc1 2662 pxor IN, STATE 2663 movups STATE, (OUTP) 2664 sub $16, LEN 2665 add $16, INP 2666 add $16, OUTP 
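	# keep looping while at least one full 16-byte block remains; any
	# partial trailing block is left for the caller to handle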
2667 cmp $16, LEN 2668 jge .Lctr_enc_loop1 2669.Lctr_enc_ret: 2670 movups IV, (IVP) 2671.Lctr_enc_just_ret: 2672 FRAME_END 2673 ret 2674ENDPROC(aesni_ctr_enc) 2675 2676/* 2677 * _aesni_gf128mul_x_ble: internal ABI 2678 * Multiply in GF(2^128) for XTS IVs 2679 * input: 2680 * IV: current IV 2681 * GF128MUL_MASK == mask with 0x87 and 0x01 2682 * output: 2683 * IV: next IV 2684 * changed: 2685 * CTR: == temporary value 2686 */ 2687#define _aesni_gf128mul_x_ble() \ 2688 pshufd $0x13, IV, CTR; \ 2689 paddq IV, IV; \ 2690 psrad $31, CTR; \ 2691 pand GF128MUL_MASK, CTR; \ 2692 pxor CTR, IV; 2693 2694/* 2695 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2696 * bool enc, u8 *iv) 2697 */ 2698ENTRY(aesni_xts_crypt8) 2699 FRAME_BEGIN 2700 cmpb $0, %cl 2701 movl $0, %ecx 2702 movl $240, %r10d 2703 leaq _aesni_enc4, %r11 2704 leaq _aesni_dec4, %rax 2705 cmovel %r10d, %ecx 2706 cmoveq %rax, %r11 2707 2708 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK 2709 movups (IVP), IV 2710 2711 mov 480(KEYP), KLEN 2712 addq %rcx, KEYP 2713 2714 movdqa IV, STATE1 2715 movdqu 0x00(INP), INC 2716 pxor INC, STATE1 2717 movdqu IV, 0x00(OUTP) 2718 2719 _aesni_gf128mul_x_ble() 2720 movdqa IV, STATE2 2721 movdqu 0x10(INP), INC 2722 pxor INC, STATE2 2723 movdqu IV, 0x10(OUTP) 2724 2725 _aesni_gf128mul_x_ble() 2726 movdqa IV, STATE3 2727 movdqu 0x20(INP), INC 2728 pxor INC, STATE3 2729 movdqu IV, 0x20(OUTP) 2730 2731 _aesni_gf128mul_x_ble() 2732 movdqa IV, STATE4 2733 movdqu 0x30(INP), INC 2734 pxor INC, STATE4 2735 movdqu IV, 0x30(OUTP) 2736 2737 call *%r11 2738 2739 movdqu 0x00(OUTP), INC 2740 pxor INC, STATE1 2741 movdqu STATE1, 0x00(OUTP) 2742 2743 _aesni_gf128mul_x_ble() 2744 movdqa IV, STATE1 2745 movdqu 0x40(INP), INC 2746 pxor INC, STATE1 2747 movdqu IV, 0x40(OUTP) 2748 2749 movdqu 0x10(OUTP), INC 2750 pxor INC, STATE2 2751 movdqu STATE2, 0x10(OUTP) 2752 2753 _aesni_gf128mul_x_ble() 2754 movdqa IV, STATE2 2755 movdqu 0x50(INP), INC 2756 pxor INC, STATE2 2757 movdqu IV, 0x50(OUTP) 2758 2759 movdqu 0x20(OUTP), INC 2760 pxor INC, STATE3 2761 movdqu STATE3, 0x20(OUTP) 2762 2763 _aesni_gf128mul_x_ble() 2764 movdqa IV, STATE3 2765 movdqu 0x60(INP), INC 2766 pxor INC, STATE3 2767 movdqu IV, 0x60(OUTP) 2768 2769 movdqu 0x30(OUTP), INC 2770 pxor INC, STATE4 2771 movdqu STATE4, 0x30(OUTP) 2772 2773 _aesni_gf128mul_x_ble() 2774 movdqa IV, STATE4 2775 movdqu 0x70(INP), INC 2776 pxor INC, STATE4 2777 movdqu IV, 0x70(OUTP) 2778 2779 _aesni_gf128mul_x_ble() 2780 movups IV, (IVP) 2781 2782 call *%r11 2783 2784 movdqu 0x40(OUTP), INC 2785 pxor INC, STATE1 2786 movdqu STATE1, 0x40(OUTP) 2787 2788 movdqu 0x50(OUTP), INC 2789 pxor INC, STATE2 2790 movdqu STATE2, 0x50(OUTP) 2791 2792 movdqu 0x60(OUTP), INC 2793 pxor INC, STATE3 2794 movdqu STATE3, 0x60(OUTP) 2795 2796 movdqu 0x70(OUTP), INC 2797 pxor INC, STATE4 2798 movdqu STATE4, 0x70(OUTP) 2799 2800 FRAME_END 2801 ret 2802ENDPROC(aesni_xts_crypt8) 2803 2804#endif 2805
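
/*
 * For reference only (not part of the build): a minimal C sketch of the
 * GF(2^128) doubling that the _aesni_gf128mul_x_ble() macro above applies to
 * the XTS tweak -- shift the 128-bit tweak left by one bit (little-endian
 * block convention) and, when a bit falls off the top, fold it back in with
 * the reduction constant 0x87.  The struct and function names here are made
 * up for this example.
 *
 *	#include <stdint.h>
 *
 *	struct xts_tweak { uint64_t lo, hi; };	// two little-endian qwords
 *
 *	static void gf128mul_x_ble_sketch(struct xts_tweak *t)
 *	{
 *		uint64_t carry = t->hi >> 63;	// bit shifted out of the block
 *
 *		t->hi = (t->hi << 1) | (t->lo >> 63);
 *		t->lo = (t->lo << 1) ^ (carry * 0x87);
 *	}
 */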