/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>

#ifdef __x86_64__
.data
POLY:       .octa 0xC2000000000000000000000000000001
TWOONE:     .octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F

SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
MASK1:      .octa 0x0000000000000000ffffffffffffffff
MASK2:      .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
ZERO:       .octa 0x00000000000000000000000000000000
ONE:        .octa 0x00000000000000000000000000000001
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
dec:        .octa 0x1
enc:        .octa 0x2


.text


#define STACK_OFFSET    8*3
#define HashKey         16*0    // store HashKey <<1 mod poly here
#define HashKey_2       16*1    // store HashKey^2 <<1 mod poly here
#define HashKey_3       16*2    // store HashKey^3 <<1 mod poly here
#define HashKey_4       16*3    // store HashKey^4 <<1 mod poly here
#define HashKey_k       16*4    // store XOR of High 64 bits and Low 64
                                // bits of HashKey <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_2_k     16*5    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^2 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_3_k     16*6    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^3 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_4_k     16*7    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^4 <<1 mod poly here
                                // (for Karatsuba purposes)
#define VARIABLE_OFFSET 16*8

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#endif


#define STATE1  %xmm0
#define STATE2  %xmm4
#define STATE3  %xmm5
#define STATE4  %xmm6
#define STATE   STATE1
#define IN1     %xmm1
#define IN2     %xmm7
#define IN3     %xmm8
#define IN4     %xmm9
#define IN      IN1
#define KEY     %xmm2
#define IV      %xmm3

#define BSWAP_MASK %xmm10
#define CTR     %xmm11
#define INC     %xmm12

#ifdef __x86_64__
#define AREG    %rax
#define KEYP    %rdi
#define OUTP    %rsi
#define UKEYP   OUTP
#define INP     %rdx
#define LEN     %rcx
#define IVP     %r8
#define KLEN    %r9d
#define T1      %r10
#define TKEYP   T1
#define T2      %r11
#define TCTR_LOW T2
#else
#define AREG    %eax
#define KEYP    %edi
#define OUTP    AREG
#define UKEYP   OUTP
#define INP     %edx
#define LEN     %esi
#define IVP     %ebp
#define KLEN    %ebx
#define T1      %ecx
#define TKEYP   T1
#endif


#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = a0*b1 + a1*b0 (middle term)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH,\TMP3
	movdqa	  \GH,\TMP4
	psrld	  $1,\TMP2		# packed right shift >>1
	psrld	  $2,\TMP3		# packed right shift >>2
	psrld	  $7,\TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*/


.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
	movd	   (%r10), \TMP1
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation
	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation
	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	psrldq	   $4, %xmm\i
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation
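	# the AAD has now been gathered 4 bytes at a time and zero-padded
	# out to a full 16-byte block when aadLen < 16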
_get_AAD_loop2_done\num_initial_blocks\operation:
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data

	xor	   %r11, %r11		# initialise the data pointer offset as zero

	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
.irpc index, \i_seq
	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, %xmm\index
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap

.endr
.irpc index, \i_seq
	pxor	   16*0(%arg1), %xmm\index
.endr
.irpc index, \i_seq
	movaps 0x10(%rdi), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 1
.endr
.irpc index, \i_seq
	movaps 0x20(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 2
.endr
.irpc index, \i_seq
	movaps 0x30(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 3
.endr
.irpc index, \i_seq
	movaps 0x40(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 4
.endr
.irpc index, \i_seq
	movaps 0x50(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 5
.endr
.irpc index, \i_seq
	movaps 0x60(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 6
.endr
.irpc index, \i_seq
	movaps 0x70(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 7
.endr
.irpc index, \i_seq
	movaps 0x80(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 8
.endr
.irpc index, \i_seq
	movaps 0x90(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 9
.endr
.irpc index, \i_seq
	movaps 0xa0(%arg1), \TMP1
	AESENCLAST \TMP1, %xmm\index	# Round 10
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

	movdqa	   \TMP1, %xmm\index
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	   _initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
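* HashKey^2, HashKey^3 and HashKey^4 (each <<1 mod poly) are derived by
* repeated GHASH_MUL calls and cached on the stack, along with the XOR of
* their high and low halves, so the 4-blocks-per-iteration main loop can
* feed its Karatsuba multiplications without recomputing them.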
* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
*/
	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM1
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM2
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM3
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM4
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	pxor	   16*0(%arg1), \XMM1
	pxor	   16*0(%arg1), \XMM2
	pxor	   16*0(%arg1), \XMM3
	pxor	   16*0(%arg1), \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	movaps 0xa0(%arg1), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM4
	add	   $64, %r11
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm


/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*/

.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
	movd	   (%r10), \TMP1
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation
	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation
	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	psrldq	   $4, %xmm\i
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data

	xor	   %r11, %r11		# initialise the data pointer offset as zero

	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
.irpc index, \i_seq
	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, %xmm\index
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap

.endr
.irpc index, \i_seq
	pxor	   16*0(%arg1), %xmm\index
.endr
.irpc index, \i_seq
	movaps 0x10(%rdi), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 1
.endr
.irpc index, \i_seq
	movaps 0x20(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 2
.endr
.irpc index, \i_seq
	movaps 0x30(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 3
.endr
.irpc index, \i_seq
	movaps 0x40(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 4
.endr
.irpc index, \i_seq
	movaps 0x50(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 5
.endr
.irpc index, \i_seq
	movaps 0x60(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 6
.endr
.irpc index, \i_seq
	movaps 0x70(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 7
.endr
.irpc index, \i_seq
	movaps 0x80(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 8
.endr
.irpc index, \i_seq
	movaps 0x90(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 9
.endr
.irpc index, \i_seq
	movaps 0xa0(%arg1), \TMP1
	AESENCLAST \TMP1, %xmm\index	# Round 10
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	   _initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
*/
	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM1
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM2
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM3
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM4
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	pxor	   16*0(%arg1), \XMM1
	pxor	   16*0(%arg1), \XMM2
	pxor	   16*0(%arg1), \XMM3
	pxor	   16*0(%arg1), \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	movaps 0xa0(%arg1), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)

	add	   $64, %r11
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

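	# \XMM5-\XMM8 take the four ciphertext blocks produced by the
	# previous iteration; their GHASH (PCLMULQDQ/Karatsuba) work is
	# interleaved with the AES rounds of the next four counter blocks
	# so that the two latencies overlap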
	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1		# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1		# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	movaps 0xa0(%arg1), \TMP3
	AESENCLAST \TMP3, \XMM1		# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2		# packed left shift << 31
	pslld	  $30, \TMP3		# packed left shift << 30
	pslld	  $25, \TMP4		# packed left shift << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift T5 1 DW
	pslldq	  $12, \TMP2		# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2		# packed right shift >>1
	psrld	  $2, \TMP3		# packed right shift >>2
	psrld	  $7, \TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5		# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0	# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1		# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1		# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1		# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	movaps 0xa0(%arg1), \TMP3
	AESENCLAST \TMP3, \XMM1		# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4
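	# decrypt path: the original ciphertext (saved in \TMP3 above) is
	# kept in \XMM1-\XMM4, so it is the ciphertext, not the plaintext
	# just written out, that gets byte-reflected and GHASHed next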
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2		# packed left shift << 31
	pslld	  $30, \TMP3		# packed left shift << 30
	pslld	  $25, \TMP4		# packed left shift << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift T5 1 DW
	pslldq	  $12, \TMP2		# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2		# packed right shift >>1
	psrld	  $2, \TMP3		# packed right shift >>2
	psrld	  $7, \TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5		# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply TMP6 * HashKey (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6	# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1	# XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2	# XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3	# XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4	# XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4		# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2		# packed left shifting << 31
	pslld	  $30, \TMP3		# packed left shifting << 30
	pslld	  $25, \TMP4		# packed left shifting << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7		# right shift TMP7 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2		# packed right shift >> 1
	psrld	  $2, \TMP3		# packed right shift >> 2
	psrld	  $7, \TMP4		# packed right shift >> 7
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst	# reduced result is in XMMDst
.endm

/* Encryption of a single block done */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor	  (%arg1), \XMM0
	movaps	  16(%arg1), \TMP1
	AESENC	  \TMP1, \XMM0
	movaps	  32(%arg1), \TMP1
	AESENC	  \TMP1, \XMM0
	movaps	  48(%arg1), \TMP1
	AESENC	  \TMP1, \XMM0
	movaps	  64(%arg1), \TMP1
	AESENC	  \TMP1, \XMM0
	movaps	  80(%arg1), \TMP1
	AESENC	  \TMP1, \XMM0
	movaps	  96(%arg1), \TMP1
	AESENC	  \TMP1, \XMM0
	movaps	  112(%arg1), \TMP1
	AESENC	  \TMP1, \XMM0
	movaps	  128(%arg1), \TMP1
	AESENC	  \TMP1, \XMM0
	movaps	  144(%arg1), \TMP1
	AESENC	  \TMP1, \XMM0
	movaps	  160(%arg1), \TMP1
	AESENCLAST \TMP1, \XMM0
.endm


/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Plaintext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Ciphertext input
*                    u64 plaintext_len,  // Length of data in bytes for decryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output. The driver will compare this to the
*                                        // given authentication tag and only return the plaintext if they match.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16
*                                        // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                        Salt  (From the SA)                    |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A1)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                  32-bit Sequence Number (A0)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                               AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A2)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |              64-bit Extended Sequence Number {A1,A0}          |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                       AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/

ENTRY(aesni_gcm_dec)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp			# align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13			# %xmm13 = HashKey
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13


# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

	# Reduction

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13		# %xmm13 holds the HashKey<<1 (mod poly)


	# Decrypt first few blocks

	movdqa	%xmm13, HashKey(%rsp)	# store HashKey<<1 (mod poly)
	mov	%arg4, %r13		# save the number of bytes of plaintext/ciphertext
	and	$-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_decrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_decrypt
	je	_initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub	$48, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub	$32, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub	$16, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	cmp	$0, %r13
	je	_zero_cipher_left_decrypt
	sub	$64, %r13
	je	_four_cipher_left_decrypt
_decrypt_by_4:
	GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add	$64, %r11
	sub	$64, %r13
	jne	_decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov	%arg4, %r13
	and	$15, %r13			# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

	# Handle the last <16 byte block separately

	paddd	ONE(%rip), %xmm0		# increment CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Yn)
	sub	$16, %r11
	add	%r13, %r11
	movdqu	(%arg3,%r11,1), %xmm1		# receive the last <16 byte block
	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2			# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1			# right shift 16-%r13 bytes
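	# %xmm1 now holds the final partial ciphertext block in its low
	# %r13 bytes; a copy is kept for GHASH and both values are masked
	# below so that only those %r13 bytes contribute to the output and
	# to the hash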

	movdqa	%xmm1, %xmm2
	pxor	%xmm1, %xmm0			# Ciphertext XOR E(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm0			# mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor	%xmm2, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	sub	%r13, %r11
	add	$16, %r11

	# output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_decrypt:
	mov	%al, (%arg2, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*8)
	MOVQ_R64_XMM %arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_decrypt:
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_decrypt
	cmp	$12, %r11
	je	_T_12_decrypt
_T_8_decrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	jmp	_return_T_done_decrypt
_T_12_decrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	psrldq	$8, %xmm0
	movd	%xmm0, %eax
	mov	%eax, 8(%r10)
	jmp	_return_T_done_decrypt
_T_16_decrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	ret


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                        Salt  (From the SA)                    |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A1)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                  32-bit Sequence Number (A0)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                               AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A2)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |              64-bit Extended Sequence Number {A1,A0}          |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                       AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13


# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

	# reduce HashKey<<1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13
	movdqa	%xmm13, HashKey(%rsp)
	mov	%arg4, %r13		# %xmm13 holds HashKey<<1 (mod poly)
	and	$-16, %r13
	mov	%r13, %r12

	# Encrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_encrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	sub	$48, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	sub	$32, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	sub	$16, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

	# Main loop - Encrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_encrypt
	sub	$64, %r13
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	mov	%arg4, %r13
	and	$15, %r13			# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0		# INCR CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0


	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# Encrypt(K, Yn)
	sub	$16, %r11
	add	%r13, %r11
	movdqu	(%arg3,%r11,1), %xmm1		# receive the last <16 byte blocks
	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2			# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1			# shift right 16-r13 bytes
	pxor	%xmm1, %xmm0			# Plaintext XOR Encrypt(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0			# mask out top 16-r13 bytes of xmm0
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	sub	%r13, %r11
	add	$16, %r11

	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	# shuffle xmm0 back to output as ciphertext

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_encrypt
	mov	%rax, (%arg2 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_encrypt:
	mov	%al, (%arg2, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*8)
	MOVQ_R64_XMM %arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8	# perform a 16 byte swap

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15	# Encrypt(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_encrypt:
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_encrypt
	cmp	$12, %r11
	je	_T_12_encrypt
_T_8_encrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	jmp	_return_T_done_encrypt
_T_12_encrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	psrldq	$8, %xmm0
	movd	%xmm0, %eax
	mov	%eax, 8(%r10)
	jmp	_return_T_done_encrypt
_T_16_encrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_encrypt:
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	ret

#endif


_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret

.align 4
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret

.align 4
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
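# _key_expansion_256b: companion to _key_expansion_256a; it advances the
# second half of the AES-256 key (%xmm2) using the AESKEYGENASSIST result
# in %xmm1 and stores the new round key at (TKEYP). %xmm4 is assumed zero
# here, as in the other _key_expansion_* helpers.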

.align 4
_key_expansion_256b:
	pshufd	$0b10101010, %xmm1, %xmm1
	shufps	$0b00010000, %xmm2, %xmm4
	pxor	%xmm4, %xmm2
	shufps	$0b10001100, %xmm2, %xmm4
	pxor	%xmm4, %xmm2
	pxor	%xmm1, %xmm2
	movaps	%xmm2, (TKEYP)
	add	$0x10, TKEYP
	ret

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
ENTRY(aesni_set_key)
#ifndef __x86_64__
	pushl	KEYP
	movl	8(%esp), KEYP		# ctx
	movl	12(%esp), UKEYP		# in_key
	movl	16(%esp), %edx		# key_len
#endif
	movups	(UKEYP), %xmm0		# user key (first 16 bytes)
	movaps	%xmm0, (KEYP)
	lea	0x10(KEYP), TKEYP	# key addr
	movl	%edx, 480(KEYP)
	pxor	%xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp	$24, %dl
	jb	.Lenc_key128
	je	.Lenc_key192
	movups	0x10(UKEYP), %xmm2	# other user key
	movaps	%xmm2, (TKEYP)
	add	$0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call	_key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call	_key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call	_key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call	_key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call	_key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call	_key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call	_key_expansion_256a
	jmp	.Ldec_key
.Lenc_key192:
	movq	0x10(UKEYP), %xmm2	# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call	_key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call	_key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call	_key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call	_key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call	_key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call	_key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call	_key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call	_key_expansion_192b
	jmp	.Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call	_key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call	_key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call	_key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call	_key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call	_key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call	_key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call	_key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call	_key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call	_key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call	_key_expansion_128
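/*
 * The tail below derives the decryption key schedule from the encryption
 * schedule just built (the AES "Equivalent Inverse Cipher"): the first and
 * last round keys are reused as-is but in swapped positions, and every
 * middle round key is run through InvMixColumns (AESIMC) and stored in
 * reverse order so that AESDEC can walk the schedule the same way AESENC
 * does.  A rough C sketch of the idea (array layout and names are
 * assumptions, not the exact in-memory layout used by this file):
 *
 *	#include <wmmintrin.h>
 *
 *	static void build_dec_schedule(const __m128i *enc, __m128i *dec, int nr)
 *	{
 *		dec[0] = enc[nr];			// last enc key opens decryption
 *		for (int i = 1; i < nr; i++)
 *			dec[i] = _mm_aesimc_si128(enc[nr - i]);
 *		dec[nr] = enc[0];			// original key closes decryption
 *	}
 */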
.Ldec_key:
	sub	$0x10, TKEYP
	movaps	(KEYP), %xmm0
	movaps	(TKEYP), %xmm1
	movaps	%xmm0, 240(TKEYP)
	movaps	%xmm1, 240(KEYP)
	add	$0x10, KEYP
	lea	240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps	(KEYP), %xmm0
	AESIMC	%xmm0 %xmm1
	movaps	%xmm1, (UKEYP)
	add	$0x10, KEYP
	sub	$0x10, UKEYP
	cmp	TKEYP, KEYP
	jb	.Ldec_key_loop
	xor	AREG, AREG
#ifndef __x86_64__
	popl	KEYP
#endif
	ret

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
#ifndef __x86_64__
	pushl	KEYP
	pushl	KLEN
	movl	12(%esp), KEYP
	movl	16(%esp), OUTP
	movl	20(%esp), INP
#endif
	movl	480(KEYP), KLEN		# key length
	movups	(INP), STATE		# input
	call	_aesni_enc1
	movups	STATE, (OUTP)		# output
#ifndef __x86_64__
	popl	KLEN
	popl	KEYP
#endif
	ret

/*
 * _aesni_enc1:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
	movaps	(KEYP), KEY		# key
	mov	KEYP, TKEYP
	pxor	KEY, STATE		# round 0
	add	$0x30, TKEYP
	cmp	$24, KLEN
	jb	.Lenc128
	lea	0x20(TKEYP), TKEYP
	je	.Lenc192
	add	$0x20, TKEYP
	movaps	-0x60(TKEYP), KEY
	AESENC	KEY STATE
	movaps	-0x50(TKEYP), KEY
	AESENC	KEY STATE
.align 4
.Lenc192:
	movaps	-0x40(TKEYP), KEY
	AESENC	KEY STATE
	movaps	-0x30(TKEYP), KEY
	AESENC	KEY STATE
.align 4
.Lenc128:
	movaps	-0x20(TKEYP), KEY
	AESENC	KEY STATE
	movaps	-0x10(TKEYP), KEY
	AESENC	KEY STATE
	movaps	(TKEYP), KEY
	AESENC	KEY STATE
	movaps	0x10(TKEYP), KEY
	AESENC	KEY STATE
	movaps	0x20(TKEYP), KEY
	AESENC	KEY STATE
	movaps	0x30(TKEYP), KEY
	AESENC	KEY STATE
	movaps	0x40(TKEYP), KEY
	AESENC	KEY STATE
	movaps	0x50(TKEYP), KEY
	AESENC	KEY STATE
	movaps	0x60(TKEYP), KEY
	AESENC	KEY STATE
	movaps	0x70(TKEYP), KEY
	AESENCLAST KEY STATE
	ret
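
/*
 * For reference, a C-level sketch of what _aesni_enc1 computes for a single
 * block, written with intrinsics purely for illustration (rk[] and the
 * function name are assumptions; nr is 10/12/14 for 128/192/256-bit keys):
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i aes_encrypt_block(__m128i blk, const __m128i *rk, int nr)
 *	{
 *		blk = _mm_xor_si128(blk, rk[0]);		// round 0: key whitening
 *		for (int i = 1; i < nr; i++)
 *			blk = _mm_aesenc_si128(blk, rk[i]);	// rounds 1 .. nr-1
 *		return _mm_aesenclast_si128(blk, rk[nr]);	// final round, no MixColumns
 *	}
 *
 * _aesni_enc4 below is the same computation interleaved over four blocks to
 * hide the latency of the AESENC instruction.
 */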

/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
	movaps	(KEYP), KEY		# key
	mov	KEYP, TKEYP
	pxor	KEY, STATE1		# round 0
	pxor	KEY, STATE2
	pxor	KEY, STATE3
	pxor	KEY, STATE4
	add	$0x30, TKEYP
	cmp	$24, KLEN
	jb	.L4enc128
	lea	0x20(TKEYP), TKEYP
	je	.L4enc192
	add	$0x20, TKEYP
	movaps	-0x60(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	movaps	-0x50(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
#.align 4
.L4enc192:
	movaps	-0x40(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	movaps	-0x30(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
#.align 4
.L4enc128:
	movaps	-0x20(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	movaps	-0x10(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	movaps	(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	movaps	0x10(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	movaps	0x20(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	movaps	0x30(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	movaps	0x40(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	movaps	0x50(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	movaps	0x60(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	movaps	0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret

/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
#ifndef __x86_64__
	pushl	KEYP
	pushl	KLEN
	movl	12(%esp), KEYP
	movl	16(%esp), OUTP
	movl	20(%esp), INP
#endif
	mov	480(KEYP), KLEN		# key length
	add	$240, KEYP
	movups	(INP), STATE		# input
	call	_aesni_dec1
	movups	STATE, (OUTP)		# output
#ifndef __x86_64__
	popl	KLEN
	popl	KEYP
#endif
	ret

/*
 * _aesni_dec1:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
	movaps	(KEYP), KEY		# key
	mov	KEYP, TKEYP
	pxor	KEY, STATE		# round 0
	add	$0x30, TKEYP
	cmp	$24, KLEN
	jb	.Ldec128
	lea	0x20(TKEYP), TKEYP
	je	.Ldec192
	add	$0x20, TKEYP
	movaps	-0x60(TKEYP), KEY
	AESDEC	KEY STATE
	movaps	-0x50(TKEYP), KEY
	AESDEC	KEY STATE
.align 4
.Ldec192:
	movaps	-0x40(TKEYP), KEY
	AESDEC	KEY STATE
	movaps	-0x30(TKEYP), KEY
	AESDEC	KEY STATE
.align 4
.Ldec128:
	movaps	-0x20(TKEYP), KEY
	AESDEC	KEY STATE
	movaps	-0x10(TKEYP), KEY
	AESDEC	KEY STATE
	movaps	(TKEYP), KEY
	AESDEC	KEY STATE
	movaps	0x10(TKEYP), KEY
	AESDEC	KEY STATE
	movaps	0x20(TKEYP), KEY
	AESDEC	KEY STATE
	movaps	0x30(TKEYP), KEY
	AESDEC	KEY STATE
	movaps	0x40(TKEYP), KEY
	AESDEC	KEY STATE
	movaps	0x50(TKEYP), KEY
	AESDEC	KEY STATE
	movaps	0x60(TKEYP), KEY
	AESDEC	KEY STATE
	movaps	0x70(TKEYP), KEY
	AESDECLAST KEY STATE
	ret

/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec4:
	movaps	(KEYP), KEY		# key
	mov	KEYP, TKEYP
	pxor	KEY, STATE1		# round 0
	pxor	KEY, STATE2
	pxor	KEY, STATE3
	pxor	KEY, STATE4
	add	$0x30, TKEYP
	cmp	$24, KLEN
	jb	.L4dec128
	lea	0x20(TKEYP), TKEYP
	je	.L4dec192
	add	$0x20, TKEYP
	movaps	-0x60(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	movaps	-0x50(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
.align 4
.L4dec192:
	movaps	-0x40(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	movaps	-0x30(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
.align 4
.L4dec128:
	movaps	-0x20(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	movaps	-0x10(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	movaps	(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	movaps	0x10(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	movaps	0x20(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	movaps	0x30(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	movaps	0x40(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	movaps	0x50(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	movaps	0x60(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	movaps	0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
ENTRY(aesni_ecb_enc)
#ifndef __x86_64__
	pushl	LEN
	pushl	KEYP
	pushl	KLEN
	movl	16(%esp), KEYP
	movl	20(%esp), OUTP
	movl	24(%esp), INP
	movl	28(%esp), LEN
#endif
	test	LEN, LEN		# check length
	jz	.Lecb_enc_ret
	mov	480(KEYP), KLEN
	cmp	$16, LEN
	jb	.Lecb_enc_ret
	cmp	$64, LEN
	jb	.Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups	(INP), STATE1
	movups	0x10(INP), STATE2
	movups	0x20(INP), STATE3
	movups	0x30(INP), STATE4
	call	_aesni_enc4
	movups	STATE1, (OUTP)
	movups	STATE2, 0x10(OUTP)
	movups	STATE3, 0x20(OUTP)
	movups	STATE4, 0x30(OUTP)
	sub	$64, LEN
	add	$64, INP
	add	$64, OUTP
	cmp	$64, LEN
	jge	.Lecb_enc_loop4
	cmp	$16, LEN
	jb	.Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups	(INP), STATE1
	call	_aesni_enc1
	movups	STATE1, (OUTP)
	sub	$16, LEN
	add	$16, INP
	add	$16, OUTP
	cmp	$16, LEN
	jge	.Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl	KLEN
	popl	KEYP
	popl	LEN
#endif
	ret

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
	pushl	LEN
	pushl	KEYP
	pushl	KLEN
	movl	16(%esp), KEYP
	movl	20(%esp), OUTP
	movl	24(%esp), INP
	movl	28(%esp), LEN
#endif
	test	LEN, LEN
	jz	.Lecb_dec_ret
	mov	480(KEYP), KLEN
	add	$240, KEYP
	cmp	$16, LEN
	jb	.Lecb_dec_ret
	cmp	$64, LEN
	jb	.Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups	(INP), STATE1
	movups	0x10(INP), STATE2
	movups	0x20(INP), STATE3
	movups	0x30(INP), STATE4
	call	_aesni_dec4
	movups	STATE1, (OUTP)
	movups	STATE2, 0x10(OUTP)
	movups	STATE3, 0x20(OUTP)
	movups	STATE4, 0x30(OUTP)
	sub	$64, LEN
	add	$64, INP
	add	$64, OUTP
	cmp	$64, LEN
	jge	.Lecb_dec_loop4
	cmp	$16, LEN
	jb	.Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups	(INP), STATE1
	call	_aesni_dec1
	movups	STATE1, (OUTP)
	sub	$16, LEN
	add	$16, INP
	add	$16, OUTP
	cmp	$16, LEN
	jge	.Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl	KLEN
	popl	KEYP
	popl	LEN
#endif
	ret

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
	pushl	IVP
	pushl	LEN
	pushl	KEYP
	pushl	KLEN
	movl	20(%esp), KEYP
	movl	24(%esp), OUTP
	movl	28(%esp), INP
	movl	32(%esp), LEN
	movl	36(%esp), IVP
#endif
	cmp	$16, LEN
	jb	.Lcbc_enc_ret
	mov	480(KEYP), KLEN
	movups	(IVP), STATE		# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups	(INP), IN		# load input
	pxor	IN, STATE
	call	_aesni_enc1
	movups	STATE, (OUTP)		# store output
	sub	$16, LEN
	add	$16, INP
	add	$16, OUTP
	cmp	$16, LEN
	jge	.Lcbc_enc_loop
	movups	STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl	KLEN
	popl	KEYP
	popl	LEN
	popl	IVP
#endif
	ret

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
	pushl	IVP
	pushl	LEN
	pushl	KEYP
	pushl	KLEN
	movl	20(%esp), KEYP
	movl	24(%esp), OUTP
	movl	28(%esp), INP
	movl	32(%esp), LEN
	movl	36(%esp), IVP
#endif
	cmp	$16, LEN
	jb	.Lcbc_dec_just_ret
	mov	480(KEYP), KLEN
	add	$240, KEYP
	movups	(IVP), IV
	cmp	$64, LEN
	jb	.Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups	(INP), IN1
	movaps	IN1, STATE1
	movups	0x10(INP), IN2
	movaps	IN2, STATE2
#ifdef __x86_64__
	movups	0x20(INP), IN3
	movaps	IN3, STATE3
	movups	0x30(INP), IN4
	movaps	IN4, STATE4
#else
	movups	0x20(INP), IN1
	movaps	IN1, STATE3
	movups	0x30(INP), IN2
	movaps	IN2, STATE4
#endif
	call	_aesni_dec4
	pxor	IV, STATE1
#ifdef __x86_64__
	pxor	IN1, STATE2
	pxor	IN2, STATE3
	pxor	IN3, STATE4
	movaps	IN4, IV
#else
	pxor	(INP), STATE2
	pxor	0x10(INP), STATE3
	pxor	IN1, STATE4
	movaps	IN2, IV
#endif
	movups	STATE1, (OUTP)
	movups	STATE2, 0x10(OUTP)
	movups	STATE3, 0x20(OUTP)
	movups	STATE4, 0x30(OUTP)
	sub	$64, LEN
	add	$64, INP
	add	$64, OUTP
	cmp	$64, LEN
	jge	.Lcbc_dec_loop4
	cmp	$16, LEN
	jb	.Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups	(INP), IN
	movaps	IN, STATE
	call	_aesni_dec1
	pxor	IV, STATE
	movups	STATE, (OUTP)
	movaps	IN, IV
	sub	$16, LEN
	add	$16, INP
	add	$16, OUTP
	cmp	$16, LEN
	jge	.Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups	IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl	KLEN
	popl	KEYP
	popl	LEN
	popl	IVP
#endif
	ret
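
/*
 * For reference, the chaining implemented by aesni_cbc_dec above is
 * P[i] = D(K, C[i]) XOR C[i-1], which is why the original ciphertext blocks
 * (IN1..IN4) are kept around for the XOR and the last ciphertext block is
 * written back as the new IV.  A rough C sketch with intrinsics, shown for
 * illustration only (names and the dk[] schedule layout are assumptions):
 *
 *	#include <stddef.h>
 *	#include <wmmintrin.h>
 *
 *	static void cbc_decrypt(__m128i *out, const __m128i *in, size_t nblocks,
 *				__m128i *iv, const __m128i *dk, int nr)
 *	{
 *		__m128i prev = *iv;
 *
 *		for (size_t b = 0; b < nblocks; b++) {
 *			__m128i c = _mm_loadu_si128(in + b);	// keep C[i] for chaining
 *			__m128i s = _mm_xor_si128(c, dk[0]);
 *			for (int i = 1; i < nr; i++)
 *				s = _mm_aesdec_si128(s, dk[i]);
 *			s = _mm_aesdeclast_si128(s, dk[nr]);
 *			_mm_storeu_si128(out + b, _mm_xor_si128(s, prev));
 *			prev = c;				// C[i] chains into P[i+1]
 *		}
 *		*iv = prev;					// updated IV for the caller
 *	}
 */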

#ifdef __x86_64__
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
.align 4
_aesni_inc_init:
	movaps	.Lbswap_mask, BSWAP_MASK
	movaps	IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR
	mov	$1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC
	MOVQ_R64_XMM CTR TCTR_LOW
	ret

/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1; IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	increased by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
.align 4
_aesni_inc:
	paddq	INC, CTR
	add	$1, TCTR_LOW
	jnc	.Linc_low
	pslldq	$8, INC
	paddq	INC, CTR
	psrldq	$8, INC
.Linc_low:
	movaps	CTR, IV
	PSHUFB_XMM BSWAP_MASK IV
	ret

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
	cmp	$16, LEN
	jb	.Lctr_enc_just_ret
	mov	480(KEYP), KLEN
	movups	(IVP), IV
	call	_aesni_inc_init
	cmp	$64, LEN
	jb	.Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps	IV, STATE1
	call	_aesni_inc
	movups	(INP), IN1
	movaps	IV, STATE2
	call	_aesni_inc
	movups	0x10(INP), IN2
	movaps	IV, STATE3
	call	_aesni_inc
	movups	0x20(INP), IN3
	movaps	IV, STATE4
	call	_aesni_inc
	movups	0x30(INP), IN4
	call	_aesni_enc4
	pxor	IN1, STATE1
	movups	STATE1, (OUTP)
	pxor	IN2, STATE2
	movups	STATE2, 0x10(OUTP)
	pxor	IN3, STATE3
	movups	STATE3, 0x20(OUTP)
	pxor	IN4, STATE4
	movups	STATE4, 0x30(OUTP)
	sub	$64, LEN
	add	$64, INP
	add	$64, OUTP
	cmp	$64, LEN
	jge	.Lctr_enc_loop4
	cmp	$16, LEN
	jb	.Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps	IV, STATE
	call	_aesni_inc
	movups	(INP), IN
	call	_aesni_enc1
	pxor	IN, STATE
	movups	STATE, (OUTP)
	sub	$16, LEN
	add	$16, INP
	add	$16, OUTP
	cmp	$16, LEN
	jge	.Lctr_enc_loop1
.Lctr_enc_ret:
	movups	IV, (IVP)
.Lctr_enc_just_ret:
	ret
#endif
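
/*
 * For reference, aesni_ctr_enc above implements plain counter mode: the IV
 * is treated as a 128-bit big-endian counter, each counter value is
 * encrypted and the result XORed into the data, and the incremented counter
 * is written back through IVP.  A rough C sketch, shown for illustration
 * only (the function name and rk[] layout are assumptions):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <wmmintrin.h>
 *
 *	static void ctr_crypt(__m128i *out, const __m128i *in, size_t nblocks,
 *			      uint8_t ctr[16], const __m128i *rk, int nr)
 *	{
 *		for (size_t b = 0; b < nblocks; b++) {
 *			__m128i s = _mm_loadu_si128((const __m128i *)ctr);
 *			s = _mm_xor_si128(s, rk[0]);
 *			for (int i = 1; i < nr; i++)
 *				s = _mm_aesenc_si128(s, rk[i]);
 *			s = _mm_aesenclast_si128(s, rk[nr]);
 *			_mm_storeu_si128(out + b,
 *					 _mm_xor_si128(s, _mm_loadu_si128(in + b)));
 *
 *			// big-endian increment, byte 15 is the least significant
 *			for (int i = 15; i >= 0 && ++ctr[i] == 0; i--)
 *				;
 *		}
 *	}
 */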