/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>

#ifdef __x86_64__
.data
POLY:   .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F

SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
MASK1:      .octa 0x0000000000000000ffffffffffffffff
MASK2:      .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
ZERO:       .octa 0x00000000000000000000000000000000
ONE:        .octa 0x00000000000000000000000000000001
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
dec:        .octa 0x1
enc:        .octa 0x2


.text


#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif


#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = (a1*b0)+(a0*b1), middle term
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH,\TMP3
	movdqa	  \GH,\TMP4
	psrld	  $1,\TMP2		# packed right shift >>1
	psrld	  $2,\TMP3		# packed right shift >>2
	psrld	  $7,\TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
*/


.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
	movd	   (%r10), \TMP1
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation
	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation
	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	psrldq	   $4, %xmm\i
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data

	xor	   %r11, %r11		# initialise the data pointer offset as zero

	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
.irpc index, \i_seq
	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, %xmm\index
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap

.endr
.irpc index, \i_seq
	pxor	   16*0(%arg1), %xmm\index
.endr
.irpc index, \i_seq
	movaps	   0x10(%rdi), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 1
.endr
.irpc index, \i_seq
	movaps	   0x20(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 2
.endr
.irpc index, \i_seq
	movaps	   0x30(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 3
.endr
.irpc index, \i_seq
	movaps	   0x40(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 4
.endr
.irpc index, \i_seq
	movaps	   0x50(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 5
.endr
.irpc index, \i_seq
	movaps	   0x60(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 6
.endr
.irpc index, \i_seq
	movaps	   0x70(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 7
.endr
.irpc index, \i_seq
	movaps	   0x80(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 8
.endr
.irpc index, \i_seq
	movaps	   0x90(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 9
.endr
.irpc index, \i_seq
	movaps	   0xa0(%arg1), \TMP1
	AESENCLAST \TMP1, %xmm\index	# Round 10
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

	movdqa	   \TMP1, %xmm\index
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	   _initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
333* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 334*/ 335 paddd ONE(%rip), \XMM0 # INCR Y0 336 movdqa \XMM0, \XMM1 337 movdqa SHUF_MASK(%rip), %xmm14 338 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 339 340 paddd ONE(%rip), \XMM0 # INCR Y0 341 movdqa \XMM0, \XMM2 342 movdqa SHUF_MASK(%rip), %xmm14 343 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 344 345 paddd ONE(%rip), \XMM0 # INCR Y0 346 movdqa \XMM0, \XMM3 347 movdqa SHUF_MASK(%rip), %xmm14 348 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 349 350 paddd ONE(%rip), \XMM0 # INCR Y0 351 movdqa \XMM0, \XMM4 352 movdqa SHUF_MASK(%rip), %xmm14 353 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 354 355 pxor 16*0(%arg1), \XMM1 356 pxor 16*0(%arg1), \XMM2 357 pxor 16*0(%arg1), \XMM3 358 pxor 16*0(%arg1), \XMM4 359 movdqa \TMP3, \TMP5 360 pshufd $78, \TMP3, \TMP1 361 pxor \TMP3, \TMP1 362 movdqa \TMP1, HashKey_k(%rsp) 363 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 364# TMP5 = HashKey^2<<1 (mod poly) 365 movdqa \TMP5, HashKey_2(%rsp) 366# HashKey_2 = HashKey^2<<1 (mod poly) 367 pshufd $78, \TMP5, \TMP1 368 pxor \TMP5, \TMP1 369 movdqa \TMP1, HashKey_2_k(%rsp) 370.irpc index, 1234 # do 4 rounds 371 movaps 0x10*\index(%arg1), \TMP1 372 AESENC \TMP1, \XMM1 373 AESENC \TMP1, \XMM2 374 AESENC \TMP1, \XMM3 375 AESENC \TMP1, \XMM4 376.endr 377 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 378# TMP5 = HashKey^3<<1 (mod poly) 379 movdqa \TMP5, HashKey_3(%rsp) 380 pshufd $78, \TMP5, \TMP1 381 pxor \TMP5, \TMP1 382 movdqa \TMP1, HashKey_3_k(%rsp) 383.irpc index, 56789 # do next 5 rounds 384 movaps 0x10*\index(%arg1), \TMP1 385 AESENC \TMP1, \XMM1 386 AESENC \TMP1, \XMM2 387 AESENC \TMP1, \XMM3 388 AESENC \TMP1, \XMM4 389.endr 390 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 391# TMP5 = HashKey^3<<1 (mod poly) 392 movdqa \TMP5, HashKey_4(%rsp) 393 pshufd $78, \TMP5, \TMP1 394 pxor \TMP5, \TMP1 395 movdqa \TMP1, HashKey_4_k(%rsp) 396 movaps 0xa0(%arg1), \TMP2 397 AESENCLAST \TMP2, \XMM1 398 AESENCLAST \TMP2, \XMM2 399 AESENCLAST \TMP2, \XMM3 400 AESENCLAST \TMP2, \XMM4 401 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 402 pxor \TMP1, \XMM1 403 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 404 movdqa \TMP1, \XMM1 405 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 406 pxor \TMP1, \XMM2 407 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 408 movdqa \TMP1, \XMM2 409 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 410 pxor \TMP1, \XMM3 411 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 412 movdqa \TMP1, \XMM3 413 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 414 pxor \TMP1, \XMM4 415 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 416 movdqa \TMP1, \XMM4 417 add $64, %r11 418 movdqa SHUF_MASK(%rip), %xmm14 419 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 420 pxor \XMMDst, \XMM1 421# combine GHASHed value with the corresponding ciphertext 422 movdqa SHUF_MASK(%rip), %xmm14 423 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 424 movdqa SHUF_MASK(%rip), %xmm14 425 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 426 movdqa SHUF_MASK(%rip), %xmm14 427 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 428 429_initial_blocks_done\num_initial_blocks\operation: 430 431.endm 432 433 434/* 435* if a = number of total plaintext bytes 436* b = floor(a/16) 437* num_initial_blocks = b mod 4 438* encrypt the initial num_initial_blocks blocks and apply ghash on 439* the ciphertext 440* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 441* are clobbered 442* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified 443*/ 444 445 446.macro 
INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 447XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 448 mov arg7, %r10 # %r10 = AAD 449 mov arg8, %r12 # %r12 = aadLen 450 mov %r12, %r11 451 pxor %xmm\i, %xmm\i 452_get_AAD_loop\num_initial_blocks\operation: 453 movd (%r10), \TMP1 454 pslldq $12, \TMP1 455 psrldq $4, %xmm\i 456 pxor \TMP1, %xmm\i 457 add $4, %r10 458 sub $4, %r12 459 jne _get_AAD_loop\num_initial_blocks\operation 460 cmp $16, %r11 461 je _get_AAD_loop2_done\num_initial_blocks\operation 462 mov $16, %r12 463_get_AAD_loop2\num_initial_blocks\operation: 464 psrldq $4, %xmm\i 465 sub $4, %r12 466 cmp %r11, %r12 467 jne _get_AAD_loop2\num_initial_blocks\operation 468_get_AAD_loop2_done\num_initial_blocks\operation: 469 movdqa SHUF_MASK(%rip), %xmm14 470 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data 471 472 xor %r11, %r11 # initialise the data pointer offset as zero 473 474 # start AES for num_initial_blocks blocks 475 476 mov %arg5, %rax # %rax = *Y0 477 movdqu (%rax), \XMM0 # XMM0 = Y0 478 movdqa SHUF_MASK(%rip), %xmm14 479 PSHUFB_XMM %xmm14, \XMM0 480 481.if (\i == 5) || (\i == 6) || (\i == 7) 482.irpc index, \i_seq 483 paddd ONE(%rip), \XMM0 # INCR Y0 484 movdqa \XMM0, %xmm\index 485 movdqa SHUF_MASK(%rip), %xmm14 486 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 487 488.endr 489.irpc index, \i_seq 490 pxor 16*0(%arg1), %xmm\index 491.endr 492.irpc index, \i_seq 493 movaps 0x10(%rdi), \TMP1 494 AESENC \TMP1, %xmm\index # Round 1 495.endr 496.irpc index, \i_seq 497 movaps 0x20(%arg1), \TMP1 498 AESENC \TMP1, %xmm\index # Round 2 499.endr 500.irpc index, \i_seq 501 movaps 0x30(%arg1), \TMP1 502 AESENC \TMP1, %xmm\index # Round 2 503.endr 504.irpc index, \i_seq 505 movaps 0x40(%arg1), \TMP1 506 AESENC \TMP1, %xmm\index # Round 2 507.endr 508.irpc index, \i_seq 509 movaps 0x50(%arg1), \TMP1 510 AESENC \TMP1, %xmm\index # Round 2 511.endr 512.irpc index, \i_seq 513 movaps 0x60(%arg1), \TMP1 514 AESENC \TMP1, %xmm\index # Round 2 515.endr 516.irpc index, \i_seq 517 movaps 0x70(%arg1), \TMP1 518 AESENC \TMP1, %xmm\index # Round 2 519.endr 520.irpc index, \i_seq 521 movaps 0x80(%arg1), \TMP1 522 AESENC \TMP1, %xmm\index # Round 2 523.endr 524.irpc index, \i_seq 525 movaps 0x90(%arg1), \TMP1 526 AESENC \TMP1, %xmm\index # Round 2 527.endr 528.irpc index, \i_seq 529 movaps 0xa0(%arg1), \TMP1 530 AESENCLAST \TMP1, %xmm\index # Round 10 531.endr 532.irpc index, \i_seq 533 movdqu (%arg3 , %r11, 1), \TMP1 534 pxor \TMP1, %xmm\index 535 movdqu %xmm\index, (%arg2 , %r11, 1) 536 # write back plaintext/ciphertext for num_initial_blocks 537 add $16, %r11 538 539 movdqa SHUF_MASK(%rip), %xmm14 540 PSHUFB_XMM %xmm14, %xmm\index 541 542 # prepare plaintext/ciphertext for GHASH computation 543.endr 544.endif 545 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 546 # apply GHASH on num_initial_blocks blocks 547 548.if \i == 5 549 pxor %xmm5, %xmm6 550 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 551 pxor %xmm6, %xmm7 552 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 553 pxor %xmm7, %xmm8 554 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 555.elseif \i == 6 556 pxor %xmm6, %xmm7 557 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 558 pxor %xmm7, %xmm8 559 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 560.elseif \i == 7 561 pxor %xmm7, %xmm8 562 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 563.endif 564 cmp $64, %r13 565 jl _initial_blocks_done\num_initial_blocks\operation 566 # 
no need for precomputed values 567/* 568* 569* Precomputations for HashKey parallel with encryption of first 4 blocks. 570* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 571*/ 572 paddd ONE(%rip), \XMM0 # INCR Y0 573 movdqa \XMM0, \XMM1 574 movdqa SHUF_MASK(%rip), %xmm14 575 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 576 577 paddd ONE(%rip), \XMM0 # INCR Y0 578 movdqa \XMM0, \XMM2 579 movdqa SHUF_MASK(%rip), %xmm14 580 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 581 582 paddd ONE(%rip), \XMM0 # INCR Y0 583 movdqa \XMM0, \XMM3 584 movdqa SHUF_MASK(%rip), %xmm14 585 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 586 587 paddd ONE(%rip), \XMM0 # INCR Y0 588 movdqa \XMM0, \XMM4 589 movdqa SHUF_MASK(%rip), %xmm14 590 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 591 592 pxor 16*0(%arg1), \XMM1 593 pxor 16*0(%arg1), \XMM2 594 pxor 16*0(%arg1), \XMM3 595 pxor 16*0(%arg1), \XMM4 596 movdqa \TMP3, \TMP5 597 pshufd $78, \TMP3, \TMP1 598 pxor \TMP3, \TMP1 599 movdqa \TMP1, HashKey_k(%rsp) 600 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 601# TMP5 = HashKey^2<<1 (mod poly) 602 movdqa \TMP5, HashKey_2(%rsp) 603# HashKey_2 = HashKey^2<<1 (mod poly) 604 pshufd $78, \TMP5, \TMP1 605 pxor \TMP5, \TMP1 606 movdqa \TMP1, HashKey_2_k(%rsp) 607.irpc index, 1234 # do 4 rounds 608 movaps 0x10*\index(%arg1), \TMP1 609 AESENC \TMP1, \XMM1 610 AESENC \TMP1, \XMM2 611 AESENC \TMP1, \XMM3 612 AESENC \TMP1, \XMM4 613.endr 614 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 615# TMP5 = HashKey^3<<1 (mod poly) 616 movdqa \TMP5, HashKey_3(%rsp) 617 pshufd $78, \TMP5, \TMP1 618 pxor \TMP5, \TMP1 619 movdqa \TMP1, HashKey_3_k(%rsp) 620.irpc index, 56789 # do next 5 rounds 621 movaps 0x10*\index(%arg1), \TMP1 622 AESENC \TMP1, \XMM1 623 AESENC \TMP1, \XMM2 624 AESENC \TMP1, \XMM3 625 AESENC \TMP1, \XMM4 626.endr 627 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 628# TMP5 = HashKey^3<<1 (mod poly) 629 movdqa \TMP5, HashKey_4(%rsp) 630 pshufd $78, \TMP5, \TMP1 631 pxor \TMP5, \TMP1 632 movdqa \TMP1, HashKey_4_k(%rsp) 633 movaps 0xa0(%arg1), \TMP2 634 AESENCLAST \TMP2, \XMM1 635 AESENCLAST \TMP2, \XMM2 636 AESENCLAST \TMP2, \XMM3 637 AESENCLAST \TMP2, \XMM4 638 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 639 pxor \TMP1, \XMM1 640 movdqu 16*1(%arg3 , %r11 , 1), \TMP1 641 pxor \TMP1, \XMM2 642 movdqu 16*2(%arg3 , %r11 , 1), \TMP1 643 pxor \TMP1, \XMM3 644 movdqu 16*3(%arg3 , %r11 , 1), \TMP1 645 pxor \TMP1, \XMM4 646 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) 647 movdqu \XMM2, 16*1(%arg2 , %r11 , 1) 648 movdqu \XMM3, 16*2(%arg2 , %r11 , 1) 649 movdqu \XMM4, 16*3(%arg2 , %r11 , 1) 650 651 add $64, %r11 652 movdqa SHUF_MASK(%rip), %xmm14 653 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 654 pxor \XMMDst, \XMM1 655# combine GHASHed value with the corresponding ciphertext 656 movdqa SHUF_MASK(%rip), %xmm14 657 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 658 movdqa SHUF_MASK(%rip), %xmm14 659 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 660 movdqa SHUF_MASK(%rip), %xmm14 661 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 662 663_initial_blocks_done\num_initial_blocks\operation: 664 665.endm 666 667/* 668* encrypt 4 blocks at a time 669* ghash the 4 previously encrypted ciphertext blocks 670* arg1, %arg2, %arg3 are used as pointers only, not modified 671* %r11 is the data offset value 672*/ 673.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ 674TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 675 676 movdqa \XMM1, \XMM5 677 
movdqa \XMM2, \XMM6 678 movdqa \XMM3, \XMM7 679 movdqa \XMM4, \XMM8 680 681 movdqa SHUF_MASK(%rip), %xmm15 682 # multiply TMP5 * HashKey using karatsuba 683 684 movdqa \XMM5, \TMP4 685 pshufd $78, \XMM5, \TMP6 686 pxor \XMM5, \TMP6 687 paddd ONE(%rip), \XMM0 # INCR CNT 688 movdqa HashKey_4(%rsp), \TMP5 689 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 690 movdqa \XMM0, \XMM1 691 paddd ONE(%rip), \XMM0 # INCR CNT 692 movdqa \XMM0, \XMM2 693 paddd ONE(%rip), \XMM0 # INCR CNT 694 movdqa \XMM0, \XMM3 695 paddd ONE(%rip), \XMM0 # INCR CNT 696 movdqa \XMM0, \XMM4 697 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 698 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 699 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 700 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 701 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 702 703 pxor (%arg1), \XMM1 704 pxor (%arg1), \XMM2 705 pxor (%arg1), \XMM3 706 pxor (%arg1), \XMM4 707 movdqa HashKey_4_k(%rsp), \TMP5 708 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 709 movaps 0x10(%arg1), \TMP1 710 AESENC \TMP1, \XMM1 # Round 1 711 AESENC \TMP1, \XMM2 712 AESENC \TMP1, \XMM3 713 AESENC \TMP1, \XMM4 714 movaps 0x20(%arg1), \TMP1 715 AESENC \TMP1, \XMM1 # Round 2 716 AESENC \TMP1, \XMM2 717 AESENC \TMP1, \XMM3 718 AESENC \TMP1, \XMM4 719 movdqa \XMM6, \TMP1 720 pshufd $78, \XMM6, \TMP2 721 pxor \XMM6, \TMP2 722 movdqa HashKey_3(%rsp), \TMP5 723 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 724 movaps 0x30(%arg1), \TMP3 725 AESENC \TMP3, \XMM1 # Round 3 726 AESENC \TMP3, \XMM2 727 AESENC \TMP3, \XMM3 728 AESENC \TMP3, \XMM4 729 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 730 movaps 0x40(%arg1), \TMP3 731 AESENC \TMP3, \XMM1 # Round 4 732 AESENC \TMP3, \XMM2 733 AESENC \TMP3, \XMM3 734 AESENC \TMP3, \XMM4 735 movdqa HashKey_3_k(%rsp), \TMP5 736 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 737 movaps 0x50(%arg1), \TMP3 738 AESENC \TMP3, \XMM1 # Round 5 739 AESENC \TMP3, \XMM2 740 AESENC \TMP3, \XMM3 741 AESENC \TMP3, \XMM4 742 pxor \TMP1, \TMP4 743# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 744 pxor \XMM6, \XMM5 745 pxor \TMP2, \TMP6 746 movdqa \XMM7, \TMP1 747 pshufd $78, \XMM7, \TMP2 748 pxor \XMM7, \TMP2 749 movdqa HashKey_2(%rsp ), \TMP5 750 751 # Multiply TMP5 * HashKey using karatsuba 752 753 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 754 movaps 0x60(%arg1), \TMP3 755 AESENC \TMP3, \XMM1 # Round 6 756 AESENC \TMP3, \XMM2 757 AESENC \TMP3, \XMM3 758 AESENC \TMP3, \XMM4 759 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 760 movaps 0x70(%arg1), \TMP3 761 AESENC \TMP3, \XMM1 # Round 7 762 AESENC \TMP3, \XMM2 763 AESENC \TMP3, \XMM3 764 AESENC \TMP3, \XMM4 765 movdqa HashKey_2_k(%rsp), \TMP5 766 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 767 movaps 0x80(%arg1), \TMP3 768 AESENC \TMP3, \XMM1 # Round 8 769 AESENC \TMP3, \XMM2 770 AESENC \TMP3, \XMM3 771 AESENC \TMP3, \XMM4 772 pxor \TMP1, \TMP4 773# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 774 pxor \XMM7, \XMM5 775 pxor \TMP2, \TMP6 776 777 # Multiply XMM8 * HashKey 778 # XMM8 and TMP5 hold the values for the two operands 779 780 movdqa \XMM8, \TMP1 781 pshufd $78, \XMM8, \TMP2 782 pxor \XMM8, \TMP2 783 movdqa HashKey(%rsp), \TMP5 784 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 785 movaps 0x90(%arg1), \TMP3 786 AESENC \TMP3, \XMM1 # Round 9 787 AESENC \TMP3, \XMM2 788 AESENC \TMP3, \XMM3 789 AESENC \TMP3, \XMM4 790 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 791 movaps 0xa0(%arg1), \TMP3 792 AESENCLAST \TMP3, \XMM1 # Round 10 793 AESENCLAST 
\TMP3, \XMM2 794 AESENCLAST \TMP3, \XMM3 795 AESENCLAST \TMP3, \XMM4 796 movdqa HashKey_k(%rsp), \TMP5 797 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 798 movdqu (%arg3,%r11,1), \TMP3 799 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 800 movdqu 16(%arg3,%r11,1), \TMP3 801 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 802 movdqu 32(%arg3,%r11,1), \TMP3 803 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 804 movdqu 48(%arg3,%r11,1), \TMP3 805 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 806 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer 807 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer 808 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer 809 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer 810 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 811 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 812 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 813 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 814 815 pxor \TMP4, \TMP1 816 pxor \XMM8, \XMM5 817 pxor \TMP6, \TMP2 818 pxor \TMP1, \TMP2 819 pxor \XMM5, \TMP2 820 movdqa \TMP2, \TMP3 821 pslldq $8, \TMP3 # left shift TMP3 2 DWs 822 psrldq $8, \TMP2 # right shift TMP2 2 DWs 823 pxor \TMP3, \XMM5 824 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 825 826 # first phase of reduction 827 828 movdqa \XMM5, \TMP2 829 movdqa \XMM5, \TMP3 830 movdqa \XMM5, \TMP4 831# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 832 pslld $31, \TMP2 # packed right shift << 31 833 pslld $30, \TMP3 # packed right shift << 30 834 pslld $25, \TMP4 # packed right shift << 25 835 pxor \TMP3, \TMP2 # xor the shifted versions 836 pxor \TMP4, \TMP2 837 movdqa \TMP2, \TMP5 838 psrldq $4, \TMP5 # right shift T5 1 DW 839 pslldq $12, \TMP2 # left shift T2 3 DWs 840 pxor \TMP2, \XMM5 841 842 # second phase of reduction 843 844 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 845 movdqa \XMM5,\TMP3 846 movdqa \XMM5,\TMP4 847 psrld $1, \TMP2 # packed left shift >>1 848 psrld $2, \TMP3 # packed left shift >>2 849 psrld $7, \TMP4 # packed left shift >>7 850 pxor \TMP3,\TMP2 # xor the shifted versions 851 pxor \TMP4,\TMP2 852 pxor \TMP5, \TMP2 853 pxor \TMP2, \XMM5 854 pxor \TMP1, \XMM5 # result is in TMP1 855 856 pxor \XMM5, \XMM1 857.endm 858 859/* 860* decrypt 4 blocks at a time 861* ghash the 4 previously decrypted ciphertext blocks 862* arg1, %arg2, %arg3 are used as pointers only, not modified 863* %r11 is the data offset value 864*/ 865.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ 866TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 867 868 movdqa \XMM1, \XMM5 869 movdqa \XMM2, \XMM6 870 movdqa \XMM3, \XMM7 871 movdqa \XMM4, \XMM8 872 873 movdqa SHUF_MASK(%rip), %xmm15 874 # multiply TMP5 * HashKey using karatsuba 875 876 movdqa \XMM5, \TMP4 877 pshufd $78, \XMM5, \TMP6 878 pxor \XMM5, \TMP6 879 paddd ONE(%rip), \XMM0 # INCR CNT 880 movdqa HashKey_4(%rsp), \TMP5 881 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 882 movdqa \XMM0, \XMM1 883 paddd ONE(%rip), \XMM0 # INCR CNT 884 movdqa \XMM0, \XMM2 885 paddd ONE(%rip), \XMM0 # INCR CNT 886 movdqa \XMM0, \XMM3 887 paddd ONE(%rip), \XMM0 # INCR CNT 888 movdqa \XMM0, \XMM4 889 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 890 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 891 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 892 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 893 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 894 895 pxor (%arg1), \XMM1 896 pxor 
(%arg1), \XMM2 897 pxor (%arg1), \XMM3 898 pxor (%arg1), \XMM4 899 movdqa HashKey_4_k(%rsp), \TMP5 900 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 901 movaps 0x10(%arg1), \TMP1 902 AESENC \TMP1, \XMM1 # Round 1 903 AESENC \TMP1, \XMM2 904 AESENC \TMP1, \XMM3 905 AESENC \TMP1, \XMM4 906 movaps 0x20(%arg1), \TMP1 907 AESENC \TMP1, \XMM1 # Round 2 908 AESENC \TMP1, \XMM2 909 AESENC \TMP1, \XMM3 910 AESENC \TMP1, \XMM4 911 movdqa \XMM6, \TMP1 912 pshufd $78, \XMM6, \TMP2 913 pxor \XMM6, \TMP2 914 movdqa HashKey_3(%rsp), \TMP5 915 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 916 movaps 0x30(%arg1), \TMP3 917 AESENC \TMP3, \XMM1 # Round 3 918 AESENC \TMP3, \XMM2 919 AESENC \TMP3, \XMM3 920 AESENC \TMP3, \XMM4 921 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 922 movaps 0x40(%arg1), \TMP3 923 AESENC \TMP3, \XMM1 # Round 4 924 AESENC \TMP3, \XMM2 925 AESENC \TMP3, \XMM3 926 AESENC \TMP3, \XMM4 927 movdqa HashKey_3_k(%rsp), \TMP5 928 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 929 movaps 0x50(%arg1), \TMP3 930 AESENC \TMP3, \XMM1 # Round 5 931 AESENC \TMP3, \XMM2 932 AESENC \TMP3, \XMM3 933 AESENC \TMP3, \XMM4 934 pxor \TMP1, \TMP4 935# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 936 pxor \XMM6, \XMM5 937 pxor \TMP2, \TMP6 938 movdqa \XMM7, \TMP1 939 pshufd $78, \XMM7, \TMP2 940 pxor \XMM7, \TMP2 941 movdqa HashKey_2(%rsp ), \TMP5 942 943 # Multiply TMP5 * HashKey using karatsuba 944 945 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 946 movaps 0x60(%arg1), \TMP3 947 AESENC \TMP3, \XMM1 # Round 6 948 AESENC \TMP3, \XMM2 949 AESENC \TMP3, \XMM3 950 AESENC \TMP3, \XMM4 951 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 952 movaps 0x70(%arg1), \TMP3 953 AESENC \TMP3, \XMM1 # Round 7 954 AESENC \TMP3, \XMM2 955 AESENC \TMP3, \XMM3 956 AESENC \TMP3, \XMM4 957 movdqa HashKey_2_k(%rsp), \TMP5 958 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 959 movaps 0x80(%arg1), \TMP3 960 AESENC \TMP3, \XMM1 # Round 8 961 AESENC \TMP3, \XMM2 962 AESENC \TMP3, \XMM3 963 AESENC \TMP3, \XMM4 964 pxor \TMP1, \TMP4 965# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 966 pxor \XMM7, \XMM5 967 pxor \TMP2, \TMP6 968 969 # Multiply XMM8 * HashKey 970 # XMM8 and TMP5 hold the values for the two operands 971 972 movdqa \XMM8, \TMP1 973 pshufd $78, \XMM8, \TMP2 974 pxor \XMM8, \TMP2 975 movdqa HashKey(%rsp), \TMP5 976 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 977 movaps 0x90(%arg1), \TMP3 978 AESENC \TMP3, \XMM1 # Round 9 979 AESENC \TMP3, \XMM2 980 AESENC \TMP3, \XMM3 981 AESENC \TMP3, \XMM4 982 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 983 movaps 0xa0(%arg1), \TMP3 984 AESENCLAST \TMP3, \XMM1 # Round 10 985 AESENCLAST \TMP3, \XMM2 986 AESENCLAST \TMP3, \XMM3 987 AESENCLAST \TMP3, \XMM4 988 movdqa HashKey_k(%rsp), \TMP5 989 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 990 movdqu (%arg3,%r11,1), \TMP3 991 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 992 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer 993 movdqa \TMP3, \XMM1 994 movdqu 16(%arg3,%r11,1), \TMP3 995 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 996 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer 997 movdqa \TMP3, \XMM2 998 movdqu 32(%arg3,%r11,1), \TMP3 999 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1000 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer 1001 movdqa \TMP3, \XMM3 1002 movdqu 48(%arg3,%r11,1), \TMP3 1003 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1004 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer 1005 movdqa \TMP3, \XMM4 1006 
PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 1010 1011 pxor \TMP4, \TMP1 1012 pxor \XMM8, \XMM5 1013 pxor \TMP6, \TMP2 1014 pxor \TMP1, \TMP2 1015 pxor \XMM5, \TMP2 1016 movdqa \TMP2, \TMP3 1017 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1018 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1019 pxor \TMP3, \XMM5 1020 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1021 1022 # first phase of reduction 1023 1024 movdqa \XMM5, \TMP2 1025 movdqa \XMM5, \TMP3 1026 movdqa \XMM5, \TMP4 1027# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1028 pslld $31, \TMP2 # packed right shift << 31 1029 pslld $30, \TMP3 # packed right shift << 30 1030 pslld $25, \TMP4 # packed right shift << 25 1031 pxor \TMP3, \TMP2 # xor the shifted versions 1032 pxor \TMP4, \TMP2 1033 movdqa \TMP2, \TMP5 1034 psrldq $4, \TMP5 # right shift T5 1 DW 1035 pslldq $12, \TMP2 # left shift T2 3 DWs 1036 pxor \TMP2, \XMM5 1037 1038 # second phase of reduction 1039 1040 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1041 movdqa \XMM5,\TMP3 1042 movdqa \XMM5,\TMP4 1043 psrld $1, \TMP2 # packed left shift >>1 1044 psrld $2, \TMP3 # packed left shift >>2 1045 psrld $7, \TMP4 # packed left shift >>7 1046 pxor \TMP3,\TMP2 # xor the shifted versions 1047 pxor \TMP4,\TMP2 1048 pxor \TMP5, \TMP2 1049 pxor \TMP2, \XMM5 1050 pxor \TMP1, \XMM5 # result is in TMP1 1051 1052 pxor \XMM5, \XMM1 1053.endm 1054 1055/* GHASH the last 4 ciphertext blocks. */ 1056.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ 1057TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst 1058 1059 # Multiply TMP6 * HashKey (using Karatsuba) 1060 1061 movdqa \XMM1, \TMP6 1062 pshufd $78, \XMM1, \TMP2 1063 pxor \XMM1, \TMP2 1064 movdqa HashKey_4(%rsp), \TMP5 1065 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 1066 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 1067 movdqa HashKey_4_k(%rsp), \TMP4 1068 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1069 movdqa \XMM1, \XMMDst 1070 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 1071 1072 # Multiply TMP1 * HashKey (using Karatsuba) 1073 1074 movdqa \XMM2, \TMP1 1075 pshufd $78, \XMM2, \TMP2 1076 pxor \XMM2, \TMP2 1077 movdqa HashKey_3(%rsp), \TMP5 1078 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1079 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 1080 movdqa HashKey_3_k(%rsp), \TMP4 1081 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1082 pxor \TMP1, \TMP6 1083 pxor \XMM2, \XMMDst 1084 pxor \TMP2, \XMM1 1085# results accumulated in TMP6, XMMDst, XMM1 1086 1087 # Multiply TMP1 * HashKey (using Karatsuba) 1088 1089 movdqa \XMM3, \TMP1 1090 pshufd $78, \XMM3, \TMP2 1091 pxor \XMM3, \TMP2 1092 movdqa HashKey_2(%rsp), \TMP5 1093 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1094 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 1095 movdqa HashKey_2_k(%rsp), \TMP4 1096 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1097 pxor \TMP1, \TMP6 1098 pxor \XMM3, \XMMDst 1099 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 1100 1101 # Multiply TMP1 * HashKey (using Karatsuba) 1102 movdqa \XMM4, \TMP1 1103 pshufd $78, \XMM4, \TMP2 1104 pxor \XMM4, \TMP2 1105 movdqa HashKey(%rsp), \TMP5 1106 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1107 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 1108 movdqa HashKey_k(%rsp), \TMP4 1109 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1110 pxor \TMP1, \TMP6 1111 pxor \XMM4, \XMMDst 1112 
pxor \XMM1, \TMP2 1113 pxor \TMP6, \TMP2 1114 pxor \XMMDst, \TMP2 1115 # middle section of the temp results combined as in karatsuba algorithm 1116 movdqa \TMP2, \TMP4 1117 pslldq $8, \TMP4 # left shift TMP4 2 DWs 1118 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1119 pxor \TMP4, \XMMDst 1120 pxor \TMP2, \TMP6 1121# TMP6:XMMDst holds the result of the accumulated carry-less multiplications 1122 # first phase of the reduction 1123 movdqa \XMMDst, \TMP2 1124 movdqa \XMMDst, \TMP3 1125 movdqa \XMMDst, \TMP4 1126# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently 1127 pslld $31, \TMP2 # packed right shifting << 31 1128 pslld $30, \TMP3 # packed right shifting << 30 1129 pslld $25, \TMP4 # packed right shifting << 25 1130 pxor \TMP3, \TMP2 # xor the shifted versions 1131 pxor \TMP4, \TMP2 1132 movdqa \TMP2, \TMP7 1133 psrldq $4, \TMP7 # right shift TMP7 1 DW 1134 pslldq $12, \TMP2 # left shift TMP2 3 DWs 1135 pxor \TMP2, \XMMDst 1136 1137 # second phase of the reduction 1138 movdqa \XMMDst, \TMP2 1139 # make 3 copies of XMMDst for doing 3 shift operations 1140 movdqa \XMMDst, \TMP3 1141 movdqa \XMMDst, \TMP4 1142 psrld $1, \TMP2 # packed left shift >> 1 1143 psrld $2, \TMP3 # packed left shift >> 2 1144 psrld $7, \TMP4 # packed left shift >> 7 1145 pxor \TMP3, \TMP2 # xor the shifted versions 1146 pxor \TMP4, \TMP2 1147 pxor \TMP7, \TMP2 1148 pxor \TMP2, \XMMDst 1149 pxor \TMP6, \XMMDst # reduced result is in XMMDst 1150.endm 1151 1152/* Encryption of a single block done*/ 1153.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 1154 1155 pxor (%arg1), \XMM0 1156 movaps 16(%arg1), \TMP1 1157 AESENC \TMP1, \XMM0 1158 movaps 32(%arg1), \TMP1 1159 AESENC \TMP1, \XMM0 1160 movaps 48(%arg1), \TMP1 1161 AESENC \TMP1, \XMM0 1162 movaps 64(%arg1), \TMP1 1163 AESENC \TMP1, \XMM0 1164 movaps 80(%arg1), \TMP1 1165 AESENC \TMP1, \XMM0 1166 movaps 96(%arg1), \TMP1 1167 AESENC \TMP1, \XMM0 1168 movaps 112(%arg1), \TMP1 1169 AESENC \TMP1, \XMM0 1170 movaps 128(%arg1), \TMP1 1171 AESENC \TMP1, \XMM0 1172 movaps 144(%arg1), \TMP1 1173 AESENC \TMP1, \XMM0 1174 movaps 160(%arg1), \TMP1 1175 AESENCLAST \TMP1, \XMM0 1176.endm 1177 1178 1179/***************************************************************************** 1180* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1181* u8 *out, // Plaintext output. Encrypt in-place is allowed. 1182* const u8 *in, // Ciphertext input 1183* u64 plaintext_len, // Length of data in bytes for decryption. 1184* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1185* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1186* // concatenated with 0x00000001. 16-byte aligned pointer. 1187* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1188* const u8 *aad, // Additional Authentication Data (AAD) 1189* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1190* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the 1191* // given authentication tag and only return the plaintext if they match. 1192* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 1193* // (most likely), 12 or 8. 1194* 1195* Assumptions: 1196* 1197* keys: 1198* keys are pre-expanded and aligned to 16 bytes. 
we are using the first 1199* set of 11 keys in the data structure void *aes_ctx 1200* 1201* iv: 1202* 0 1 2 3 1203* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1204* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1205* | Salt (From the SA) | 1206* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1207* | Initialization Vector | 1208* | (This is the sequence number from IPSec header) | 1209* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1210* | 0x1 | 1211* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1212* 1213* 1214* 1215* AAD: 1216* AAD padded to 128 bits with 0 1217* for example, assume AAD is a u32 vector 1218* 1219* if AAD is 8 bytes: 1220* AAD[3] = {A0, A1}; 1221* padded AAD in xmm register = {A1 A0 0 0} 1222* 1223* 0 1 2 3 1224* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1225* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1226* | SPI (A1) | 1227* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1228* | 32-bit Sequence Number (A0) | 1229* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1230* | 0x0 | 1231* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1232* 1233* AAD Format with 32-bit Sequence Number 1234* 1235* if AAD is 12 bytes: 1236* AAD[3] = {A0, A1, A2}; 1237* padded AAD in xmm register = {A2 A1 A0 0} 1238* 1239* 0 1 2 3 1240* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1241* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1242* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1243* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1244* | SPI (A2) | 1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1246* | 64-bit Extended Sequence Number {A1,A0} | 1247* | | 1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1249* | 0x0 | 1250* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1251* 1252* AAD Format with 64-bit Extended Sequence Number 1253* 1254* aadLen: 1255* from the definition of the spec, aadLen can only be 8 or 12 bytes. 1256* The code supports 16 too but for other sizes, the code will fail. 1257* 1258* TLen: 1259* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 1260* For other sizes, the code will fail. 
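*
* Illustrative only: a minimal, hypothetical C-side sketch of how a caller
* might assemble the arguments described above and invoke this routine for
* RFC4106. Identifiers such as aes_ctx, salt, iv, hash_subkey, dst, src and
* the tag buffer are assumptions made for the example, not definitions
* taken from this file or from the kernel glue code:
*
*	u8 j0[16] __aligned(16);		// pre-counter block Y0
*	u8 computed_tag[16];
*
*	memcpy(j0, salt, 4);			// 4-byte salt from the SA
*	memcpy(j0 + 4, iv, 8);			// 8-byte IV from the ESP payload
*	*(__be32 *)(j0 + 12) = cpu_to_be32(1);	// trailing 0x00000001
*
*	aesni_gcm_dec(aes_ctx, dst, src, src_len,
*		      j0, hash_subkey, aad, aad_len,
*		      computed_tag, 16);
*	// the caller then compares computed_tag with the received tag and
*	// discards the plaintext on mismatch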
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/

ENTRY(aesni_gcm_dec)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp			# align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13			# %xmm13 = HashKey
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13


# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

	# Reduction

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13		# %xmm13 holds the HashKey<<1 (mod poly)


	# Decrypt first few blocks

	movdqa	%xmm13, HashKey(%rsp)	# store HashKey<<1 (mod poly)
	mov	%arg4, %r13		# save the number of bytes of plaintext/ciphertext
	and	$-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_decrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_decrypt
	je	_initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub	$48, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub	$32, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub	$16, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	cmp	$0, %r13
	je	_zero_cipher_left_decrypt
	sub	$64, %r13
	je	_four_cipher_left_decrypt
_decrypt_by_4:
	GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add	$64, %r11
	sub	$64, %r13
	jne	_decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov	%arg4, %r13
	and	$15, %r13		# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

	# Handle the last <16 byte block separately

	paddd	ONE(%rip), %xmm0	# increment CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Yn)
	sub	$16, %r11
	add	%r13, %r11
	movdqu	(%arg3,%r11,1), %xmm1	# receive the last <16 byte block
	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1		# right shift 16-%r13 bytes

	movdqa	%xmm1, %xmm2
	pxor	%xmm1, %xmm0		# Ciphertext XOR E(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm0		# mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor	%xmm2, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	sub	%r13, %r11
	add	$16, %r11

	# output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_decrypt:
	mov	%al, (%arg2, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*8)
	MOVQ_R64_XMM %arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_decrypt:
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_decrypt
	cmp	$12, %r11
	je	_T_12_decrypt
_T_8_decrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	jmp	_return_T_done_decrypt
_T_12_decrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	psrldq	$8, %xmm0
	movd	%xmm0, %eax
	mov	%eax, 8(%r10)
	jmp	_return_T_done_decrypt
_T_16_decrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	ret


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes.
we are using the 1461* first set of 11 keys in the data structure void *aes_ctx 1462* 1463* 1464* iv: 1465* 0 1 2 3 1466* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1467* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1468* | Salt (From the SA) | 1469* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1470* | Initialization Vector | 1471* | (This is the sequence number from IPSec header) | 1472* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1473* | 0x1 | 1474* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1475* 1476* 1477* 1478* AAD: 1479* AAD padded to 128 bits with 0 1480* for example, assume AAD is a u32 vector 1481* 1482* if AAD is 8 bytes: 1483* AAD[3] = {A0, A1}; 1484* padded AAD in xmm register = {A1 A0 0 0} 1485* 1486* 0 1 2 3 1487* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1488* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1489* | SPI (A1) | 1490* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1491* | 32-bit Sequence Number (A0) | 1492* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1493* | 0x0 | 1494* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1495* 1496* AAD Format with 32-bit Sequence Number 1497* 1498* if AAD is 12 bytes: 1499* AAD[3] = {A0, A1, A2}; 1500* padded AAD in xmm register = {A2 A1 A0 0} 1501* 1502* 0 1 2 3 1503* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1504* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1505* | SPI (A2) | 1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1507* | 64-bit Extended Sequence Number {A1,A0} | 1508* | | 1509* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1510* | 0x0 | 1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1512* 1513* AAD Format with 64-bit Extended Sequence Number 1514* 1515* aadLen: 1516* from the definition of the spec, aadLen can only be 8 or 12 bytes. 1517* The code supports 16 too but for other sizes, the code will fail. 1518* 1519* TLen: 1520* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 1521* For other sizes, the code will fail. 
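*
* For orientation, the tag this routine emits follows the standard GCM
* construction: GHASH under H over the zero-padded AAD, the zero-padded
* ciphertext and the length block, XORed with E(K, Y0). A rough,
* hypothetical C sketch of that flow (ghash_update() and
* aes_encrypt_block() are illustrative helpers, not functions defined in
* this file):
*
*	be128 S = {};                               // GHASH accumulator
*	be128 lenblk;
*	u8 eky0[16];
*
*	ghash_update(&S, H, aad, aad_len);          // zero-padded to 16 bytes
*	ghash_update(&S, H, ciphertext, text_len);  // zero-padded to 16 bytes
*	lenblk.a = cpu_to_be64(aad_len * 8);        // len(A) in bits
*	lenblk.b = cpu_to_be64(text_len * 8);       // len(C) in bits
*	ghash_update(&S, H, &lenblk, 16);
*	aes_encrypt_block(aes_ctx, eky0, Y0);       // E(K, Y0)
*	for (int i = 0; i < 16; i++)
*		auth_tag[i] = eky0[i] ^ ((u8 *)&S)[i];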
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13


# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

	# reduce HashKey<<1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13		# %xmm13 holds HashKey<<1 (mod poly)
	movdqa	%xmm13, HashKey(%rsp)
	mov	%arg4, %r13		# save the number of bytes of plaintext/ciphertext
	and	$-16, %r13
	mov	%r13, %r12

	# Encrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_encrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	sub	$48, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	sub	$32, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	sub	$16, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

	# Main loop - Encrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_encrypt
	sub	$64, %r13
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	mov	%arg4, %r13
	and	$15, %r13		# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0	# INCR CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# Encrypt(K, Yn)
	sub	$16, %r11
	add	%r13, %r11
	movdqu	(%arg3,%r11,1), %xmm1	# receive the last <16 byte block
	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1		# shift right 16-r13 bytes
	pxor	%xmm1, %xmm0		# Plaintext XOR Encrypt(K, Yn)
	movdqu
ALL_F-SHIFT_MASK(%r12), %xmm1 1627 # get the appropriate mask to mask out top 16-r13 bytes of xmm0 1628 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 1629 movdqa SHUF_MASK(%rip), %xmm10 1630 PSHUFB_XMM %xmm10,%xmm0 1631 1632 pxor %xmm0, %xmm8 1633 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1634 # GHASH computation for the last <16 byte block 1635 sub %r13, %r11 1636 add $16, %r11 1637 PSHUFB_XMM %xmm10, %xmm1 1638 1639 # shuffle xmm0 back to output as ciphertext 1640 1641 # Output %r13 bytes 1642 MOVQ_R64_XMM %xmm0, %rax 1643 cmp $8, %r13 1644 jle _less_than_8_bytes_left_encrypt 1645 mov %rax, (%arg2 , %r11, 1) 1646 add $8, %r11 1647 psrldq $8, %xmm0 1648 MOVQ_R64_XMM %xmm0, %rax 1649 sub $8, %r13 1650_less_than_8_bytes_left_encrypt: 1651 mov %al, (%arg2, %r11, 1) 1652 add $1, %r11 1653 shr $8, %rax 1654 sub $1, %r13 1655 jne _less_than_8_bytes_left_encrypt 1656_multiple_of_16_bytes_encrypt: 1657 mov arg8, %r12 # %r12 = addLen (number of bytes) 1658 shl $3, %r12 1659 movd %r12d, %xmm15 # len(A) in %xmm15 1660 shl $3, %arg4 # len(C) in bits (*128) 1661 MOVQ_R64_XMM %arg4, %xmm1 1662 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 1663 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 1664 pxor %xmm15, %xmm8 1665 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 1666 # final GHASH computation 1667 movdqa SHUF_MASK(%rip), %xmm10 1668 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap 1669 1670 mov %arg5, %rax # %rax = *Y0 1671 movdqu (%rax), %xmm0 # %xmm0 = Y0 1672 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) 1673 pxor %xmm8, %xmm0 1674_return_T_encrypt: 1675 mov arg9, %r10 # %r10 = authTag 1676 mov arg10, %r11 # %r11 = auth_tag_len 1677 cmp $16, %r11 1678 je _T_16_encrypt 1679 cmp $12, %r11 1680 je _T_12_encrypt 1681_T_8_encrypt: 1682 MOVQ_R64_XMM %xmm0, %rax 1683 mov %rax, (%r10) 1684 jmp _return_T_done_encrypt 1685_T_12_encrypt: 1686 MOVQ_R64_XMM %xmm0, %rax 1687 mov %rax, (%r10) 1688 psrldq $8, %xmm0 1689 movd %xmm0, %eax 1690 mov %eax, 8(%r10) 1691 jmp _return_T_done_encrypt 1692_T_16_encrypt: 1693 movdqu %xmm0, (%r10) 1694_return_T_done_encrypt: 1695 mov %r14, %rsp 1696 pop %r14 1697 pop %r13 1698 pop %r12 1699 ret 1700 1701#endif 1702 1703 1704_key_expansion_128: 1705_key_expansion_256a: 1706 pshufd $0b11111111, %xmm1, %xmm1 1707 shufps $0b00010000, %xmm0, %xmm4 1708 pxor %xmm4, %xmm0 1709 shufps $0b10001100, %xmm0, %xmm4 1710 pxor %xmm4, %xmm0 1711 pxor %xmm1, %xmm0 1712 movaps %xmm0, (TKEYP) 1713 add $0x10, TKEYP 1714 ret 1715 1716.align 4 1717_key_expansion_192a: 1718 pshufd $0b01010101, %xmm1, %xmm1 1719 shufps $0b00010000, %xmm0, %xmm4 1720 pxor %xmm4, %xmm0 1721 shufps $0b10001100, %xmm0, %xmm4 1722 pxor %xmm4, %xmm0 1723 pxor %xmm1, %xmm0 1724 1725 movaps %xmm2, %xmm5 1726 movaps %xmm2, %xmm6 1727 pslldq $4, %xmm5 1728 pshufd $0b11111111, %xmm0, %xmm3 1729 pxor %xmm3, %xmm2 1730 pxor %xmm5, %xmm2 1731 1732 movaps %xmm0, %xmm1 1733 shufps $0b01000100, %xmm0, %xmm6 1734 movaps %xmm6, (TKEYP) 1735 shufps $0b01001110, %xmm2, %xmm1 1736 movaps %xmm1, 0x10(TKEYP) 1737 add $0x20, TKEYP 1738 ret 1739 1740.align 4 1741_key_expansion_192b: 1742 pshufd $0b01010101, %xmm1, %xmm1 1743 shufps $0b00010000, %xmm0, %xmm4 1744 pxor %xmm4, %xmm0 1745 shufps $0b10001100, %xmm0, %xmm4 1746 pxor %xmm4, %xmm0 1747 pxor %xmm1, %xmm0 1748 1749 movaps %xmm2, %xmm5 1750 pslldq $4, %xmm5 1751 pshufd $0b11111111, %xmm0, %xmm3 1752 pxor %xmm3, %xmm2 1753 pxor %xmm5, %xmm2 1754 1755 movaps %xmm0, (TKEYP) 1756 add $0x10, TKEYP 1757 ret 1758 1759.align 4 
_key_expansion_128:
_key_expansion_256a:
        pshufd  $0b11111111, %xmm1, %xmm1
        shufps  $0b00010000, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        shufps  $0b10001100, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        pxor    %xmm1, %xmm0
        movaps  %xmm0, (TKEYP)
        add     $0x10, TKEYP
        ret

.align 4
_key_expansion_192a:
        pshufd  $0b01010101, %xmm1, %xmm1
        shufps  $0b00010000, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        shufps  $0b10001100, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        pxor    %xmm1, %xmm0

        movaps  %xmm2, %xmm5
        movaps  %xmm2, %xmm6
        pslldq  $4, %xmm5
        pshufd  $0b11111111, %xmm0, %xmm3
        pxor    %xmm3, %xmm2
        pxor    %xmm5, %xmm2

        movaps  %xmm0, %xmm1
        shufps  $0b01000100, %xmm0, %xmm6
        movaps  %xmm6, (TKEYP)
        shufps  $0b01001110, %xmm2, %xmm1
        movaps  %xmm1, 0x10(TKEYP)
        add     $0x20, TKEYP
        ret

.align 4
_key_expansion_192b:
        pshufd  $0b01010101, %xmm1, %xmm1
        shufps  $0b00010000, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        shufps  $0b10001100, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        pxor    %xmm1, %xmm0

        movaps  %xmm2, %xmm5
        pslldq  $4, %xmm5
        pshufd  $0b11111111, %xmm0, %xmm3
        pxor    %xmm3, %xmm2
        pxor    %xmm5, %xmm2

        movaps  %xmm0, (TKEYP)
        add     $0x10, TKEYP
        ret

.align 4
_key_expansion_256b:
        pshufd  $0b10101010, %xmm1, %xmm1
        shufps  $0b00010000, %xmm2, %xmm4
        pxor    %xmm4, %xmm2
        shufps  $0b10001100, %xmm2, %xmm4
        pxor    %xmm4, %xmm2
        pxor    %xmm1, %xmm2
        movaps  %xmm2, (TKEYP)
        add     $0x10, TKEYP
        ret

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
ENTRY(aesni_set_key)
#ifndef __x86_64__
        pushl   KEYP
        movl    8(%esp), KEYP           # ctx
        movl    12(%esp), UKEYP         # in_key
        movl    16(%esp), %edx          # key_len
#endif
        movups  (UKEYP), %xmm0          # user key (first 16 bytes)
        movaps  %xmm0, (KEYP)
        lea     0x10(KEYP), TKEYP       # key addr
        movl    %edx, 480(KEYP)
        pxor    %xmm4, %xmm4            # xmm4 is assumed 0 in _key_expansion_x
        cmp     $24, %dl
        jb      .Lenc_key128
        je      .Lenc_key192
        movups  0x10(UKEYP), %xmm2      # other user key
        movaps  %xmm2, (TKEYP)
        add     $0x10, TKEYP
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call    _key_expansion_256a
        AESKEYGENASSIST 0x1 %xmm0 %xmm1
        call    _key_expansion_256b
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call    _key_expansion_256a
        AESKEYGENASSIST 0x2 %xmm0 %xmm1
        call    _key_expansion_256b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call    _key_expansion_256a
        AESKEYGENASSIST 0x4 %xmm0 %xmm1
        call    _key_expansion_256b
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call    _key_expansion_256a
        AESKEYGENASSIST 0x8 %xmm0 %xmm1
        call    _key_expansion_256b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call    _key_expansion_256a
        AESKEYGENASSIST 0x10 %xmm0 %xmm1
        call    _key_expansion_256b
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call    _key_expansion_256a
        AESKEYGENASSIST 0x20 %xmm0 %xmm1
        call    _key_expansion_256b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call    _key_expansion_256a
        jmp     .Ldec_key
.Lenc_key192:
        movq    0x10(UKEYP), %xmm2      # other user key
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call    _key_expansion_192a
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call    _key_expansion_192b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call    _key_expansion_192a
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call    _key_expansion_192b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call    _key_expansion_192a
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call    _key_expansion_192b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call    _key_expansion_192a
        AESKEYGENASSIST 0x80 %xmm2 %xmm1        # round 8
        call    _key_expansion_192b
        jmp     .Ldec_key
.Lenc_key128:
        AESKEYGENASSIST 0x1 %xmm0 %xmm1         # round 1
        call    _key_expansion_128
        AESKEYGENASSIST 0x2 %xmm0 %xmm1         # round 2
        call    _key_expansion_128
        AESKEYGENASSIST 0x4 %xmm0 %xmm1         # round 3
        call    _key_expansion_128
        AESKEYGENASSIST 0x8 %xmm0 %xmm1         # round 4
        call    _key_expansion_128
        AESKEYGENASSIST 0x10 %xmm0 %xmm1        # round 5
        call    _key_expansion_128
        AESKEYGENASSIST 0x20 %xmm0 %xmm1        # round 6
        call    _key_expansion_128
        AESKEYGENASSIST 0x40 %xmm0 %xmm1        # round 7
        call    _key_expansion_128
        AESKEYGENASSIST 0x80 %xmm0 %xmm1        # round 8
        call    _key_expansion_128
        AESKEYGENASSIST 0x1b %xmm0 %xmm1        # round 9
        call    _key_expansion_128
        AESKEYGENASSIST 0x36 %xmm0 %xmm1        # round 10
        call    _key_expansion_128
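/*
 * .Ldec_key below derives the decryption key schedule required by AESDEC:
 * the encryption round keys in reverse order, with InvMixColumns (AESIMC)
 * applied to every round key except the first and last.  A hedged C sketch
 * of the standard construction using AES-NI intrinsics; the flat layout and
 * the helper name are illustrative, not the exact crypto_aes_ctx layout:
 *
 *	#include <immintrin.h>		// build with -maes
 *
 *	static void build_dec_schedule(const __m128i *enc, __m128i *dec, int nr)
 *	{
 *		int i;
 *
 *		dec[0] = enc[nr];
 *		for (i = 1; i < nr; i++)
 *			dec[i] = _mm_aesimc_si128(enc[nr - i]);
 *		dec[nr] = enc[0];
 *	}
 */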
.Ldec_key:
        sub     $0x10, TKEYP
        movaps  (KEYP), %xmm0
        movaps  (TKEYP), %xmm1
        movaps  %xmm0, 240(TKEYP)
        movaps  %xmm1, 240(KEYP)
        add     $0x10, KEYP
        lea     240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
        movaps  (KEYP), %xmm0
        AESIMC  %xmm0 %xmm1
        movaps  %xmm1, (UKEYP)
        add     $0x10, KEYP
        sub     $0x10, UKEYP
        cmp     TKEYP, KEYP
        jb      .Ldec_key_loop
        xor     AREG, AREG
#ifndef __x86_64__
        popl    KEYP
#endif
        ret

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
#ifndef __x86_64__
        pushl   KEYP
        pushl   KLEN
        movl    12(%esp), KEYP
        movl    16(%esp), OUTP
        movl    20(%esp), INP
#endif
        movl    480(KEYP), KLEN         # key length
        movups  (INP), STATE            # input
        call    _aesni_enc1
        movups  STATE, (OUTP)           # output
#ifndef __x86_64__
        popl    KLEN
        popl    KEYP
#endif
        ret

/*
 * _aesni_enc1: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE:          initial state (input)
 * output:
 *      STATE:          final state (output)
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_enc1:
        movaps  (KEYP), KEY             # key
        mov     KEYP, TKEYP
        pxor    KEY, STATE              # round 0
        add     $0x30, TKEYP
        cmp     $24, KLEN
        jb      .Lenc128
        lea     0x20(TKEYP), TKEYP
        je      .Lenc192
        add     $0x20, TKEYP
        movaps  -0x60(TKEYP), KEY
        AESENC  KEY STATE
        movaps  -0x50(TKEYP), KEY
        AESENC  KEY STATE
.align 4
.Lenc192:
        movaps  -0x40(TKEYP), KEY
        AESENC  KEY STATE
        movaps  -0x30(TKEYP), KEY
        AESENC  KEY STATE
.align 4
.Lenc128:
        movaps  -0x20(TKEYP), KEY
        AESENC  KEY STATE
        movaps  -0x10(TKEYP), KEY
        AESENC  KEY STATE
        movaps  (TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x10(TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x20(TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x30(TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x40(TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x50(TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x60(TKEYP), KEY
        AESENC  KEY STATE
        movaps  0x70(TKEYP), KEY
        AESENCLAST KEY STATE
        ret

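/*
 * _aesni_enc1 above is the usual AES-NI round loop: XOR in the first round
 * key, one AESENC per middle round, then AESENCLAST, with 10/12/14 rounds
 * selected by the key length.  An equivalent, hedged C sketch using the
 * intrinsics (the flat round-key array is an assumption, not the layout
 * used by this file):
 *
 *	#include <immintrin.h>		// build with -maes
 *
 *	static __m128i aes_encrypt_block(const __m128i *rk, int nr, __m128i b)
 *	{
 *		int i;			// nr = key_len/4 + 6 = 10/12/14 rounds
 *
 *		b = _mm_xor_si128(b, rk[0]);
 *		for (i = 1; i < nr; i++)
 *			b = _mm_aesenc_si128(b, rk[i]);
 *		return _mm_aesenclast_si128(b, rk[nr]);
 *	}
 */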
/*
 * _aesni_enc4: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE1:         initial state (input)
 *      STATE2
 *      STATE3
 *      STATE4
 * output:
 *      STATE1:         final state (output)
 *      STATE2
 *      STATE3
 *      STATE4
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_enc4:
        movaps  (KEYP), KEY             # key
        mov     KEYP, TKEYP
        pxor    KEY, STATE1             # round 0
        pxor    KEY, STATE2
        pxor    KEY, STATE3
        pxor    KEY, STATE4
        add     $0x30, TKEYP
        cmp     $24, KLEN
        jb      .L4enc128
        lea     0x20(TKEYP), TKEYP
        je      .L4enc192
        add     $0x20, TKEYP
        movaps  -0x60(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  -0x50(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
#.align 4
.L4enc192:
        movaps  -0x40(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  -0x30(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
#.align 4
.L4enc128:
        movaps  -0x20(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  -0x10(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  (TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x10(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x20(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x30(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x40(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x50(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x60(TKEYP), KEY
        AESENC  KEY STATE1
        AESENC  KEY STATE2
        AESENC  KEY STATE3
        AESENC  KEY STATE4
        movaps  0x70(TKEYP), KEY
        AESENCLAST KEY STATE1           # last round
        AESENCLAST KEY STATE2
        AESENCLAST KEY STATE3
        AESENCLAST KEY STATE4
        ret

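/*
 * _aesni_enc4 above issues each round key to four independent states before
 * moving on; AESENC has multi-cycle latency but pipelines well, so the four
 * dependency chains overlap and hide most of that latency.  A hedged C
 * sketch of the same idea (same #include <immintrin.h> / -maes assumptions
 * and illustrative names as the sketch above):
 *
 *	static void aes_encrypt_4blocks(const __m128i *rk, int nr, __m128i s[4])
 *	{
 *		int i, j;
 *
 *		for (j = 0; j < 4; j++)
 *			s[j] = _mm_xor_si128(s[j], rk[0]);
 *		for (i = 1; i < nr; i++)
 *			for (j = 0; j < 4; j++)
 *				s[j] = _mm_aesenc_si128(s[j], rk[i]);
 *		for (j = 0; j < 4; j++)
 *			s[j] = _mm_aesenclast_si128(s[j], rk[nr]);
 *	}
 */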
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
#ifndef __x86_64__
        pushl   KEYP
        pushl   KLEN
        movl    12(%esp), KEYP
        movl    16(%esp), OUTP
        movl    20(%esp), INP
#endif
        mov     480(KEYP), KLEN         # key length
        add     $240, KEYP
        movups  (INP), STATE            # input
        call    _aesni_dec1
        movups  STATE, (OUTP)           # output
#ifndef __x86_64__
        popl    KLEN
        popl    KEYP
#endif
        ret

/*
 * _aesni_dec1: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE:          initial state (input)
 * output:
 *      STATE:          final state (output)
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_dec1:
        movaps  (KEYP), KEY             # key
        mov     KEYP, TKEYP
        pxor    KEY, STATE              # round 0
        add     $0x30, TKEYP
        cmp     $24, KLEN
        jb      .Ldec128
        lea     0x20(TKEYP), TKEYP
        je      .Ldec192
        add     $0x20, TKEYP
        movaps  -0x60(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  -0x50(TKEYP), KEY
        AESDEC  KEY STATE
.align 4
.Ldec192:
        movaps  -0x40(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  -0x30(TKEYP), KEY
        AESDEC  KEY STATE
.align 4
.Ldec128:
        movaps  -0x20(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  -0x10(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  (TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x10(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x20(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x30(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x40(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x50(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x60(TKEYP), KEY
        AESDEC  KEY STATE
        movaps  0x70(TKEYP), KEY
        AESDECLAST KEY STATE
        ret

/*
 * _aesni_dec4: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE1:         initial state (input)
 *      STATE2
 *      STATE3
 *      STATE4
 * output:
 *      STATE1:         final state (output)
 *      STATE2
 *      STATE3
 *      STATE4
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_dec4:
        movaps  (KEYP), KEY             # key
        mov     KEYP, TKEYP
        pxor    KEY, STATE1             # round 0
        pxor    KEY, STATE2
        pxor    KEY, STATE3
        pxor    KEY, STATE4
        add     $0x30, TKEYP
        cmp     $24, KLEN
        jb      .L4dec128
        lea     0x20(TKEYP), TKEYP
        je      .L4dec192
        add     $0x20, TKEYP
        movaps  -0x60(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  -0x50(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
.align 4
.L4dec192:
        movaps  -0x40(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  -0x30(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
.align 4
.L4dec128:
        movaps  -0x20(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  -0x10(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  (TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x10(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x20(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x30(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x40(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x50(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x60(TKEYP), KEY
        AESDEC  KEY STATE1
        AESDEC  KEY STATE2
        AESDEC  KEY STATE3
        AESDEC  KEY STATE4
        movaps  0x70(TKEYP), KEY
        AESDECLAST KEY STATE1           # last round
        AESDECLAST KEY STATE2
        AESDECLAST KEY STATE3
        AESDECLAST KEY STATE4
        ret

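/*
 * The ECB/CBC/CTR entry points below share one chunking pattern: while at
 * least 64 bytes remain, feed four blocks to the _aesni_*4 helpers, then
 * finish any remaining whole 16-byte blocks one at a time.  A hedged C
 * sketch of that driver loop for ECB encryption, reusing the illustrative
 * aes_encrypt_block()/aes_encrypt_4blocks() sketches above:
 *
 *	static void ecb_encrypt(const __m128i *rk, int nr, unsigned char *dst,
 *				const unsigned char *src, unsigned long len)
 *	{
 *		__m128i s[4];
 *		int j;
 *
 *		while (len >= 64) {
 *			for (j = 0; j < 4; j++)
 *				s[j] = _mm_loadu_si128((const __m128i *)src + j);
 *			aes_encrypt_4blocks(rk, nr, s);
 *			for (j = 0; j < 4; j++)
 *				_mm_storeu_si128((__m128i *)dst + j, s[j]);
 *			src += 64; dst += 64; len -= 64;
 *		}
 *		while (len >= 16) {
 *			s[0] = _mm_loadu_si128((const __m128i *)src);
 *			_mm_storeu_si128((__m128i *)dst,
 *					 aes_encrypt_block(rk, nr, s[0]));
 *			src += 16; dst += 16; len -= 16;
 *		}
 *	}
 */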
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
ENTRY(aesni_ecb_enc)
#ifndef __x86_64__
        pushl   LEN
        pushl   KEYP
        pushl   KLEN
        movl    16(%esp), KEYP
        movl    20(%esp), OUTP
        movl    24(%esp), INP
        movl    28(%esp), LEN
#endif
        test    LEN, LEN                # check length
        jz      .Lecb_enc_ret
        mov     480(KEYP), KLEN
        cmp     $16, LEN
        jb      .Lecb_enc_ret
        cmp     $64, LEN
        jb      .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
        movups  (INP), STATE1
        movups  0x10(INP), STATE2
        movups  0x20(INP), STATE3
        movups  0x30(INP), STATE4
        call    _aesni_enc4
        movups  STATE1, (OUTP)
        movups  STATE2, 0x10(OUTP)
        movups  STATE3, 0x20(OUTP)
        movups  STATE4, 0x30(OUTP)
        sub     $64, LEN
        add     $64, INP
        add     $64, OUTP
        cmp     $64, LEN
        jge     .Lecb_enc_loop4
        cmp     $16, LEN
        jb      .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
        movups  (INP), STATE1
        call    _aesni_enc1
        movups  STATE1, (OUTP)
        sub     $16, LEN
        add     $16, INP
        add     $16, OUTP
        cmp     $16, LEN
        jge     .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
        popl    KLEN
        popl    KEYP
        popl    LEN
#endif
        ret

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
        pushl   LEN
        pushl   KEYP
        pushl   KLEN
        movl    16(%esp), KEYP
        movl    20(%esp), OUTP
        movl    24(%esp), INP
        movl    28(%esp), LEN
#endif
        test    LEN, LEN
        jz      .Lecb_dec_ret
        mov     480(KEYP), KLEN
        add     $240, KEYP
        cmp     $16, LEN
        jb      .Lecb_dec_ret
        cmp     $64, LEN
        jb      .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
        movups  (INP), STATE1
        movups  0x10(INP), STATE2
        movups  0x20(INP), STATE3
        movups  0x30(INP), STATE4
        call    _aesni_dec4
        movups  STATE1, (OUTP)
        movups  STATE2, 0x10(OUTP)
        movups  STATE3, 0x20(OUTP)
        movups  STATE4, 0x30(OUTP)
        sub     $64, LEN
        add     $64, INP
        add     $64, OUTP
        cmp     $64, LEN
        jge     .Lecb_dec_loop4
        cmp     $16, LEN
        jb      .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
        movups  (INP), STATE1
        call    _aesni_dec1
        movups  STATE1, (OUTP)
        sub     $16, LEN
        add     $16, INP
        add     $16, OUTP
        cmp     $16, LEN
        jge     .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
        popl    KLEN
        popl    KEYP
        popl    LEN
#endif
        ret

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
        pushl   IVP
        pushl   LEN
        pushl   KEYP
        pushl   KLEN
        movl    20(%esp), KEYP
        movl    24(%esp), OUTP
        movl    28(%esp), INP
        movl    32(%esp), LEN
        movl    36(%esp), IVP
#endif
        cmp     $16, LEN
        jb      .Lcbc_enc_ret
        mov     480(KEYP), KLEN
        movups  (IVP), STATE            # load iv as initial state
.align 4
.Lcbc_enc_loop:
        movups  (INP), IN               # load input
        pxor    IN, STATE
        call    _aesni_enc1
        movups  STATE, (OUTP)           # store output
        sub     $16, LEN
        add     $16, INP
        add     $16, OUTP
        cmp     $16, LEN
        jge     .Lcbc_enc_loop
        movups  STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
        popl    KLEN
        popl    KEYP
        popl    LEN
        popl    IVP
#endif
        ret

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
        pushl   IVP
        pushl   LEN
        pushl   KEYP
        pushl   KLEN
        movl    20(%esp), KEYP
        movl    24(%esp), OUTP
        movl    28(%esp), INP
        movl    32(%esp), LEN
        movl    36(%esp), IVP
#endif
        cmp     $16, LEN
        jb      .Lcbc_dec_just_ret
        mov     480(KEYP), KLEN
        add     $240, KEYP
        movups  (IVP), IV
        cmp     $64, LEN
        jb      .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
        movups  (INP), IN1
        movaps  IN1, STATE1
        movups  0x10(INP), IN2
        movaps  IN2, STATE2
#ifdef __x86_64__
        movups  0x20(INP), IN3
        movaps  IN3, STATE3
        movups  0x30(INP), IN4
        movaps  IN4, STATE4
#else
        movups  0x20(INP), IN1
        movaps  IN1, STATE3
        movups  0x30(INP), IN2
        movaps  IN2, STATE4
#endif
        call    _aesni_dec4
        pxor    IV, STATE1
#ifdef __x86_64__
        pxor    IN1, STATE2
        pxor    IN2, STATE3
        pxor    IN3, STATE4
        movaps  IN4, IV
#else
        pxor    (INP), STATE2
        pxor    0x10(INP), STATE3
        pxor    IN1, STATE4
        movaps  IN2, IV
#endif
        movups  STATE1, (OUTP)
        movups  STATE2, 0x10(OUTP)
        movups  STATE3, 0x20(OUTP)
        movups  STATE4, 0x30(OUTP)
        sub     $64, LEN
        add     $64, INP
        add     $64, OUTP
        cmp     $64, LEN
        jge     .Lcbc_dec_loop4
        cmp     $16, LEN
        jb      .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
        movups  (INP), IN
        movaps  IN, STATE
        call    _aesni_dec1
        pxor    IV, STATE
        movups  STATE, (OUTP)
        movaps  IN, IV
        sub     $16, LEN
        add     $16, INP
        add     $16, OUTP
        cmp     $16, LEN
        jge     .Lcbc_dec_loop1
.Lcbc_dec_ret:
        movups  IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
        popl    KLEN
        popl    KEYP
        popl    LEN
        popl    IVP
#endif
        ret

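/*
 * The CBC routines above implement the usual chaining: encryption is
 * inherently serial, C[i] = E(K, P[i] ^ C[i-1]) with C[-1] = IV, while
 * decryption can push four blocks through AESDEC at once because the
 * chaining XOR, P[i] = D(K, C[i]) ^ C[i-1], happens after the cipher.
 * A hedged single-block C sketch of the encryption side (illustrative
 * names; reuses aes_encrypt_block() from the sketch further above):
 *
 *	static void cbc_encrypt(const __m128i *rk, int nr, unsigned char *dst,
 *				const unsigned char *src, unsigned long len,
 *				unsigned char *iv)
 *	{
 *		__m128i chain = _mm_loadu_si128((const __m128i *)iv);
 *
 *		while (len >= 16) {
 *			chain = _mm_xor_si128(chain,
 *					      _mm_loadu_si128((const __m128i *)src));
 *			chain = aes_encrypt_block(rk, nr, chain);
 *			_mm_storeu_si128((__m128i *)dst, chain);
 *			src += 16; dst += 16; len -= 16;
 *		}
 *		_mm_storeu_si128((__m128i *)iv, chain);
 *	}
 */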
#ifdef __x86_64__
.align 16
.Lbswap_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * _aesni_inc_init: internal ABI
 * setup registers used by _aesni_inc
 * input:
 *      IV
 * output:
 *      CTR:            == IV, in little endian
 *      TCTR_LOW:       == lower qword of CTR
 *      INC:            == 1, in little endian
 *      BSWAP_MASK:     == endian swapping mask
 */
.align 4
_aesni_inc_init:
        movaps  .Lbswap_mask, BSWAP_MASK
        movaps  IV, CTR
        PSHUFB_XMM BSWAP_MASK CTR
        mov     $1, TCTR_LOW
        MOVQ_R64_XMM TCTR_LOW INC
        MOVQ_R64_XMM CTR TCTR_LOW
        ret

/*
 * _aesni_inc: internal ABI
 * Increase IV by 1, IV is in big endian
 * input:
 *      IV
 *      CTR:            == IV, in little endian
 *      TCTR_LOW:       == lower qword of CTR
 *      INC:            == 1, in little endian
 *      BSWAP_MASK:     == endian swapping mask
 * output:
 *      IV:             Increased by 1
 * changed:
 *      CTR:            == output IV, in little endian
 *      TCTR_LOW:       == lower qword of CTR
 */
.align 4
_aesni_inc:
        paddq   INC, CTR
        add     $1, TCTR_LOW
        jnc     .Linc_low
        pslldq  $8, INC
        paddq   INC, CTR
        psrldq  $8, INC
.Linc_low:
        movaps  CTR, IV
        PSHUFB_XMM BSWAP_MASK IV
        ret

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
        cmp     $16, LEN
        jb      .Lctr_enc_just_ret
        mov     480(KEYP), KLEN
        movups  (IVP), IV
        call    _aesni_inc_init
        cmp     $64, LEN
        jb      .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
        movaps  IV, STATE1
        call    _aesni_inc
        movups  (INP), IN1
        movaps  IV, STATE2
        call    _aesni_inc
        movups  0x10(INP), IN2
        movaps  IV, STATE3
        call    _aesni_inc
        movups  0x20(INP), IN3
        movaps  IV, STATE4
        call    _aesni_inc
        movups  0x30(INP), IN4
        call    _aesni_enc4
        pxor    IN1, STATE1
        movups  STATE1, (OUTP)
        pxor    IN2, STATE2
        movups  STATE2, 0x10(OUTP)
        pxor    IN3, STATE3
        movups  STATE3, 0x20(OUTP)
        pxor    IN4, STATE4
        movups  STATE4, 0x30(OUTP)
        sub     $64, LEN
        add     $64, INP
        add     $64, OUTP
        cmp     $64, LEN
        jge     .Lctr_enc_loop4
        cmp     $16, LEN
        jb      .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
        movaps  IV, STATE
        call    _aesni_inc
        movups  (INP), IN
        call    _aesni_enc1
        pxor    IN, STATE
        movups  STATE, (OUTP)
        sub     $16, LEN
        add     $16, INP
        add     $16, OUTP
        cmp     $16, LEN
        jge     .Lctr_enc_loop1
.Lctr_enc_ret:
        movups  IV, (IVP)
.Lctr_enc_just_ret:
        ret
#endif
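/*
 * For reference, _aesni_inc above keeps the counter byte-swapped so the low
 * 64 bits can be bumped with a single paddq, folding a carry into the high
 * quadword only when the low half wraps; aesni_ctr_enc then XORs each
 * encrypted counter block into the data.  A hedged C sketch of the same
 * big-endian 128-bit counter increment (name illustrative only):
 *
 *	static void ctr128_inc(unsigned char counter[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++counter[i])	// stop once a byte does not wrap
 *				break;
 *	}
 */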