/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values: for FP
 * use movaps (move aligned packed single), for integer use movdqa (move
 * double quad aligned).  It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released.
 * However, movaps is a byte shorter, so that is the one we'll use for now
 * (same for the unaligned variants).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
76# more specifically, ALL_F should follow SHIFT_MASK, 77# and zero should follow ALL_F 78.section .rodata, "a", @progbits 79.align 16 80SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 81ALL_F: .octa 0xffffffffffffffffffffffffffffffff 82 .octa 0x00000000000000000000000000000000 83 84.text 85 86 87#define STACK_OFFSET 8*3 88 89#define AadHash 16*0 90#define AadLen 16*1 91#define InLen (16*1)+8 92#define PBlockEncKey 16*2 93#define OrigIV 16*3 94#define CurCount 16*4 95#define PBlockLen 16*5 96#define HashKey 16*6 // store HashKey <<1 mod poly here 97#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here 98#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here 99#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here 100#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64 101 // bits of HashKey <<1 mod poly here 102 //(for Karatsuba purposes) 103#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64 104 // bits of HashKey^2 <<1 mod poly here 105 // (for Karatsuba purposes) 106#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64 107 // bits of HashKey^3 <<1 mod poly here 108 // (for Karatsuba purposes) 109#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64 110 // bits of HashKey^4 <<1 mod poly here 111 // (for Karatsuba purposes) 112 113#define arg1 rdi 114#define arg2 rsi 115#define arg3 rdx 116#define arg4 rcx 117#define arg5 r8 118#define arg6 r9 119#define arg7 STACK_OFFSET+8(%rsp) 120#define arg8 STACK_OFFSET+16(%rsp) 121#define arg9 STACK_OFFSET+24(%rsp) 122#define arg10 STACK_OFFSET+32(%rsp) 123#define arg11 STACK_OFFSET+40(%rsp) 124#define keysize 2*15*16(%arg1) 125#endif 126 127 128#define STATE1 %xmm0 129#define STATE2 %xmm4 130#define STATE3 %xmm5 131#define STATE4 %xmm6 132#define STATE STATE1 133#define IN1 %xmm1 134#define IN2 %xmm7 135#define IN3 %xmm8 136#define IN4 %xmm9 137#define IN IN1 138#define KEY %xmm2 139#define IV %xmm3 140 141#define BSWAP_MASK %xmm10 142#define CTR %xmm11 143#define INC %xmm12 144 145#define GF128MUL_MASK %xmm7 146 147#ifdef __x86_64__ 148#define AREG %rax 149#define KEYP %rdi 150#define OUTP %rsi 151#define UKEYP OUTP 152#define INP %rdx 153#define LEN %rcx 154#define IVP %r8 155#define KLEN %r9d 156#define T1 %r10 157#define TKEYP T1 158#define T2 %r11 159#define TCTR_LOW T2 160#else 161#define AREG %eax 162#define KEYP %edi 163#define OUTP AREG 164#define UKEYP OUTP 165#define INP %edx 166#define LEN %esi 167#define IVP %ebp 168#define KLEN %ebx 169#define T1 %ecx 170#define TKEYP T1 171#endif 172 173.macro FUNC_SAVE 174 push %r12 175 push %r13 176 push %r14 177# 178# states of %xmm registers %xmm6:%xmm15 not saved 179# all %xmm registers are clobbered 180# 181.endm 182 183 184.macro FUNC_RESTORE 185 pop %r14 186 pop %r13 187 pop %r12 188.endm 189 190# Precompute hashkeys. 191# Input: Hash subkey. 192# Output: HashKeys stored in gcm_context_data. Only needs to be called 193# once per key. 194# clobbers r12, and tmp xmm registers. 
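# The table built here is what the 4-way parallel GHASH code consumes:
# HashKey..HashKey_4 hold H<<1 mod poly through H^4<<1 mod poly, and
# HashKey_k..HashKey_4_k hold the XOR of the high and low 64-bit halves
# of each power, precomputed for the Karatsuba multiplications.
# Each successive power is produced by one GHASH_MUL call on the previous one.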
.macro	PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	pshufb	\TMP2, \TMP3

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3
	psrlq	$63, \TMP2
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2
	psrldq	$8, \TMP1
	por	\TMP2, \TMP3

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd	TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP3, \TMP1
	pxor	\TMP3, \TMP1
	movdqu	\TMP1, HashKey_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	\TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	\TMP5, HashKey_3(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	\TMP5, HashKey_4(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_4_k(%arg2)
.endm

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro	GCM_INIT Iv SUBKEY AAD AADLEN
	mov	\AADLEN, %r11
	mov	%r11, AadLen(%arg2)		# ctx_data.aad_length = aad_length
	xor	%r11d, %r11d
	mov	%r11, InLen(%arg2)		# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)		# ctx_data.partial_block_length = 0
	mov	%r11, PBlockEncKey(%arg2)	# ctx_data.partial_block_enc_key = 0
	mov	\Iv, %rax
	movdqu	(%rax), %xmm0
	movdqu	%xmm0, OrigIV(%arg2)		# ctx_data.orig_IV = iv

	movdqa	SHUF_MASK(%rip), %xmm2
	pshufb	%xmm2, %xmm0
	movdqu	%xmm0, CurCount(%arg2)		# ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu	HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
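# Data is consumed in this order: any partial block left over from a
# previous update call (PARTIAL_BLOCK), then 0-3 blocks to reach a
# 4-block boundary (INITIAL_BLOCKS_ENC_DEC), then the 4-blocks-at-a-time
# main loop (GHASH_4_ENCRYPT_4_PARALLEL), and finally a trailing <16 byte
# block whose state is carried in PBlockLen/PBlockEncKey for the next call.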
273# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK 274# Clobbers rax, r10-r13, and xmm0-xmm15 275.macro GCM_ENC_DEC operation 276 movdqu AadHash(%arg2), %xmm8 277 movdqu HashKey(%arg2), %xmm13 278 add %arg5, InLen(%arg2) 279 280 xor %r11d, %r11d # initialise the data pointer offset as zero 281 PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation 282 283 sub %r11, %arg5 # sub partial block data used 284 mov %arg5, %r13 # save the number of bytes 285 286 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) 287 mov %r13, %r12 288 # Encrypt/Decrypt first few blocks 289 290 and $(3<<4), %r12 291 jz _initial_num_blocks_is_0_\@ 292 cmp $(2<<4), %r12 293 jb _initial_num_blocks_is_1_\@ 294 je _initial_num_blocks_is_2_\@ 295_initial_num_blocks_is_3_\@: 296 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 297%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation 298 sub $48, %r13 299 jmp _initial_blocks_\@ 300_initial_num_blocks_is_2_\@: 301 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 302%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation 303 sub $32, %r13 304 jmp _initial_blocks_\@ 305_initial_num_blocks_is_1_\@: 306 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 307%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation 308 sub $16, %r13 309 jmp _initial_blocks_\@ 310_initial_num_blocks_is_0_\@: 311 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 312%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation 313_initial_blocks_\@: 314 315 # Main loop - Encrypt/Decrypt remaining blocks 316 317 test %r13, %r13 318 je _zero_cipher_left_\@ 319 sub $64, %r13 320 je _four_cipher_left_\@ 321_crypt_by_4_\@: 322 GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ 323 %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ 324 %xmm7, %xmm8, enc 325 add $64, %r11 326 sub $64, %r13 327 jne _crypt_by_4_\@ 328_four_cipher_left_\@: 329 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 330%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 331_zero_cipher_left_\@: 332 movdqu %xmm8, AadHash(%arg2) 333 movdqu %xmm0, CurCount(%arg2) 334 335 mov %arg5, %r13 336 and $15, %r13 # %r13 = arg5 (mod 16) 337 je _multiple_of_16_bytes_\@ 338 339 mov %r13, PBlockLen(%arg2) 340 341 # Handle the last <16 Byte block separately 342 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 343 movdqu %xmm0, CurCount(%arg2) 344 movdqa SHUF_MASK(%rip), %xmm10 345 pshufb %xmm10, %xmm0 346 347 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) 348 movdqu %xmm0, PBlockEncKey(%arg2) 349 350 cmp $16, %arg5 351 jge _large_enough_update_\@ 352 353 lea (%arg4,%r11,1), %r10 354 mov %r13, %r12 355 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 356 jmp _data_read_\@ 357 358_large_enough_update_\@: 359 sub $16, %r11 360 add %r13, %r11 361 362 # receive the last <16 Byte block 363 movdqu (%arg4, %r11, 1), %xmm1 364 365 sub %r13, %r11 366 add $16, %r11 367 368 lea SHIFT_MASK+16(%rip), %r12 369 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 370 # (r13 is the number of bytes in plaintext mod 16) 371 sub %r13, %r12 372 # get the appropriate shuffle mask 373 movdqu (%r12), %xmm2 374 # shift right 16-r13 bytes 375 pshufb %xmm2, %xmm1 376 377_data_read_\@: 378 lea ALL_F+16(%rip), %r12 379 sub %r13, %r12 380 381.ifc \operation, dec 382 movdqa %xmm1, %xmm2 383.endif 384 pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn) 385 movdqu (%r12), %xmm1 386 # get the appropriate mask 
to mask out top 16-r13 bytes of xmm0 387 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 388.ifc \operation, dec 389 pand %xmm1, %xmm2 390 movdqa SHUF_MASK(%rip), %xmm10 391 pshufb %xmm10 ,%xmm2 392 393 pxor %xmm2, %xmm8 394.else 395 movdqa SHUF_MASK(%rip), %xmm10 396 pshufb %xmm10,%xmm0 397 398 pxor %xmm0, %xmm8 399.endif 400 401 movdqu %xmm8, AadHash(%arg2) 402.ifc \operation, enc 403 # GHASH computation for the last <16 byte block 404 movdqa SHUF_MASK(%rip), %xmm10 405 # shuffle xmm0 back to output as ciphertext 406 pshufb %xmm10, %xmm0 407.endif 408 409 # Output %r13 bytes 410 movq %xmm0, %rax 411 cmp $8, %r13 412 jle _less_than_8_bytes_left_\@ 413 mov %rax, (%arg3 , %r11, 1) 414 add $8, %r11 415 psrldq $8, %xmm0 416 movq %xmm0, %rax 417 sub $8, %r13 418_less_than_8_bytes_left_\@: 419 mov %al, (%arg3, %r11, 1) 420 add $1, %r11 421 shr $8, %rax 422 sub $1, %r13 423 jne _less_than_8_bytes_left_\@ 424_multiple_of_16_bytes_\@: 425.endm 426 427# GCM_COMPLETE Finishes update of tag of last partial block 428# Output: Authorization Tag (AUTH_TAG) 429# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 430.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN 431 movdqu AadHash(%arg2), %xmm8 432 movdqu HashKey(%arg2), %xmm13 433 434 mov PBlockLen(%arg2), %r12 435 436 test %r12, %r12 437 je _partial_done\@ 438 439 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 440 441_partial_done\@: 442 mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes) 443 shl $3, %r12 # convert into number of bits 444 movd %r12d, %xmm15 # len(A) in %xmm15 445 mov InLen(%arg2), %r12 446 shl $3, %r12 # len(C) in bits (*128) 447 movq %r12, %xmm1 448 449 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 450 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) 451 pxor %xmm15, %xmm8 452 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 453 # final GHASH computation 454 movdqa SHUF_MASK(%rip), %xmm10 455 pshufb %xmm10, %xmm8 456 457 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 458 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) 459 pxor %xmm8, %xmm0 460_return_T_\@: 461 mov \AUTHTAG, %r10 # %r10 = authTag 462 mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len 463 cmp $16, %r11 464 je _T_16_\@ 465 cmp $8, %r11 466 jl _T_4_\@ 467_T_8_\@: 468 movq %xmm0, %rax 469 mov %rax, (%r10) 470 add $8, %r10 471 sub $8, %r11 472 psrldq $8, %xmm0 473 test %r11, %r11 474 je _return_T_done_\@ 475_T_4_\@: 476 movd %xmm0, %eax 477 mov %eax, (%r10) 478 add $4, %r10 479 sub $4, %r11 480 psrldq $4, %xmm0 481 test %r11, %r11 482 je _return_T_done_\@ 483_T_123_\@: 484 movd %xmm0, %eax 485 cmp $2, %r11 486 jl _T_1_\@ 487 mov %ax, (%r10) 488 cmp $2, %r11 489 je _return_T_done_\@ 490 add $2, %r10 491 sar $16, %eax 492_T_1_\@: 493 mov %al, (%r10) 494 jmp _return_T_done_\@ 495_T_16_\@: 496 movdqu %xmm0, (%r10) 497_return_T_done_\@: 498.endm 499 500#ifdef __x86_64__ 501/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 502* 503* 504* Input: A and B (128-bits each, bit-reflected) 505* Output: C = A*B*x mod poly, (i.e. >>1 ) 506* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 507* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
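*
* The multiply below is done Karatsuba-style on the 64-bit halves, with
* all products carry-less (pclmulqdq):
*   A*B = A1*B1*x^128 + [(A1^A0)*(B1^B0) ^ A1*B1 ^ A0*B0]*x^64 + A0*B0
* The 256-bit result is then folded back to 128 bits with the two-phase
* shift/XOR reduction for the bit-reflected GCM polynomial
* x^128 + x^127 + x^126 + x^121 + 1.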
*
*/
.macro	GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	pclmulqdq $0x11, \HK, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \HK, \GH	# GH = a0*b0
	pclmulqdq $0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = (a1*b0)+(a0*b1)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

        # first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld     $31, \TMP2		# packed left shift <<31
	pslld     $30, \TMP3		# packed left shift <<30
	pslld     $25, \TMP4		# packed left shift <<25
	pxor      \TMP3, \TMP2		# xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5		# right shift TMP5 1 DW
	pslldq    $12, \TMP2		# left shift TMP2 3 DWs
	pxor      \TMP2, \GH

        # second phase of the reduction

	movdqa    \GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2		# packed right shift >>1
	psrld     $2,\TMP3		# packed right shift >>2
	psrld     $7,\TMP4		# packed right shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH		# result is in GH
.endm

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	_read_lt8_\@
	mov	(\DPTR), %rax
	movq	%rax, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax
_read_next_byte_\@:
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_\@
	movq	%rax, \XMM1
	pslldq	$8, \XMM1
	por	\XMM1, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	xor	%eax, %eax
_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	movq	%rax, \XMMDst
_done_read_partial_block_\@:
.endm

# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
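# The AAD is consumed 16 bytes at a time: each block is byte-reflected and
# folded into the running hash with GHASH_MUL; a trailing <16 byte remainder
# is pulled in with READ_PARTIAL_BLOCK (zero padded) and folded the same way.
# The resulting hash is written to AadHash in the gcm_context_data.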
593# clobbers r10-11, xmm14 594.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \ 595 TMP6 TMP7 596 MOVADQ SHUF_MASK(%rip), %xmm14 597 mov \AAD, %r10 # %r10 = AAD 598 mov \AADLEN, %r11 # %r11 = aadLen 599 pxor \TMP7, \TMP7 600 pxor \TMP6, \TMP6 601 602 cmp $16, %r11 603 jl _get_AAD_rest\@ 604_get_AAD_blocks\@: 605 movdqu (%r10), \TMP7 606 pshufb %xmm14, \TMP7 # byte-reflect the AAD data 607 pxor \TMP7, \TMP6 608 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 609 add $16, %r10 610 sub $16, %r11 611 cmp $16, %r11 612 jge _get_AAD_blocks\@ 613 614 movdqu \TMP6, \TMP7 615 616 /* read the last <16B of AAD */ 617_get_AAD_rest\@: 618 test %r11, %r11 619 je _get_AAD_done\@ 620 621 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 622 pshufb %xmm14, \TMP7 # byte-reflect the AAD data 623 pxor \TMP6, \TMP7 624 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 625 movdqu \TMP7, \TMP6 626 627_get_AAD_done\@: 628 movdqu \TMP6, AadHash(%arg2) 629.endm 630 631# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks 632# between update calls. 633# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK 634# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context 635# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 636.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ 637 AAD_HASH operation 638 mov PBlockLen(%arg2), %r13 639 test %r13, %r13 640 je _partial_block_done_\@ # Leave Macro if no partial blocks 641 # Read in input data without over reading 642 cmp $16, \PLAIN_CYPH_LEN 643 jl _fewer_than_16_bytes_\@ 644 movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm 645 jmp _data_read_\@ 646 647_fewer_than_16_bytes_\@: 648 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 649 mov \PLAIN_CYPH_LEN, %r12 650 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 651 652 mov PBlockLen(%arg2), %r13 653 654_data_read_\@: # Finished reading in data 655 656 movdqu PBlockEncKey(%arg2), %xmm9 657 movdqu HashKey(%arg2), %xmm13 658 659 lea SHIFT_MASK(%rip), %r12 660 661 # adjust the shuffle mask pointer to be able to shift r13 bytes 662 # r16-r13 is the number of bytes in plaintext mod 16) 663 add %r13, %r12 664 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 665 pshufb %xmm2, %xmm9 # shift right r13 bytes 666 667.ifc \operation, dec 668 movdqa %xmm1, %xmm3 669 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) 670 671 mov \PLAIN_CYPH_LEN, %r10 672 add %r13, %r10 673 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 674 sub $16, %r10 675 # Determine if if partial block is not being filled and 676 # shift mask accordingly 677 jge _no_extra_mask_1_\@ 678 sub %r10, %r12 679_no_extra_mask_1_\@: 680 681 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 682 # get the appropriate mask to mask out bottom r13 bytes of xmm9 683 pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9 684 685 pand %xmm1, %xmm3 686 movdqa SHUF_MASK(%rip), %xmm10 687 pshufb %xmm10, %xmm3 688 pshufb %xmm2, %xmm3 689 pxor %xmm3, \AAD_HASH 690 691 test %r10, %r10 692 jl _partial_incomplete_1_\@ 693 694 # GHASH computation for the last <16 Byte block 695 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 696 xor %eax, %eax 697 698 mov %rax, PBlockLen(%arg2) 699 jmp _dec_done_\@ 700_partial_incomplete_1_\@: 701 add \PLAIN_CYPH_LEN, PBlockLen(%arg2) 702_dec_done_\@: 703 movdqu \AAD_HASH, AadHash(%arg2) 704.else 705 pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) 706 707 mov \PLAIN_CYPH_LEN, %r10 708 add %r13, %r10 709 # Set r10 
to be the amount of data left in CYPH_PLAIN_IN after filling 710 sub $16, %r10 711 # Determine if if partial block is not being filled and 712 # shift mask accordingly 713 jge _no_extra_mask_2_\@ 714 sub %r10, %r12 715_no_extra_mask_2_\@: 716 717 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 718 # get the appropriate mask to mask out bottom r13 bytes of xmm9 719 pand %xmm1, %xmm9 720 721 movdqa SHUF_MASK(%rip), %xmm1 722 pshufb %xmm1, %xmm9 723 pshufb %xmm2, %xmm9 724 pxor %xmm9, \AAD_HASH 725 726 test %r10, %r10 727 jl _partial_incomplete_2_\@ 728 729 # GHASH computation for the last <16 Byte block 730 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 731 xor %eax, %eax 732 733 mov %rax, PBlockLen(%arg2) 734 jmp _encode_done_\@ 735_partial_incomplete_2_\@: 736 add \PLAIN_CYPH_LEN, PBlockLen(%arg2) 737_encode_done_\@: 738 movdqu \AAD_HASH, AadHash(%arg2) 739 740 movdqa SHUF_MASK(%rip), %xmm10 741 # shuffle xmm9 back to output as ciphertext 742 pshufb %xmm10, %xmm9 743 pshufb %xmm2, %xmm9 744.endif 745 # output encrypted Bytes 746 test %r10, %r10 747 jl _partial_fill_\@ 748 mov %r13, %r12 749 mov $16, %r13 750 # Set r13 to be the number of bytes to write out 751 sub %r12, %r13 752 jmp _count_set_\@ 753_partial_fill_\@: 754 mov \PLAIN_CYPH_LEN, %r13 755_count_set_\@: 756 movdqa %xmm9, %xmm0 757 movq %xmm0, %rax 758 cmp $8, %r13 759 jle _less_than_8_bytes_left_\@ 760 761 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 762 add $8, \DATA_OFFSET 763 psrldq $8, %xmm0 764 movq %xmm0, %rax 765 sub $8, %r13 766_less_than_8_bytes_left_\@: 767 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 768 add $1, \DATA_OFFSET 769 shr $8, %rax 770 sub $1, %r13 771 jne _less_than_8_bytes_left_\@ 772_partial_block_done_\@: 773.endm # PARTIAL_BLOCK 774 775/* 776* if a = number of total plaintext bytes 777* b = floor(a/16) 778* num_initial_blocks = b mod 4 779* encrypt the initial num_initial_blocks blocks and apply ghash on 780* the ciphertext 781* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 782* are clobbered 783* arg1, %arg2, %arg3 are used as a pointer only, not modified 784*/ 785 786 787.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 788 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 789 MOVADQ SHUF_MASK(%rip), %xmm14 790 791 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 792 793 # start AES for num_initial_blocks blocks 794 795 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 796 797.if (\i == 5) || (\i == 6) || (\i == 7) 798 799 MOVADQ ONE(%RIP),\TMP1 800 MOVADQ 0(%arg1),\TMP2 801.irpc index, \i_seq 802 paddd \TMP1, \XMM0 # INCR Y0 803.ifc \operation, dec 804 movdqa \XMM0, %xmm\index 805.else 806 MOVADQ \XMM0, %xmm\index 807.endif 808 pshufb %xmm14, %xmm\index # perform a 16 byte swap 809 pxor \TMP2, %xmm\index 810.endr 811 lea 0x10(%arg1),%r10 812 mov keysize,%eax 813 shr $2,%eax # 128->4, 192->6, 256->8 814 add $5,%eax # 128->9, 192->11, 256->13 815 816aes_loop_initial_\@: 817 MOVADQ (%r10),\TMP1 818.irpc index, \i_seq 819 aesenc \TMP1, %xmm\index 820.endr 821 add $16,%r10 822 sub $1,%eax 823 jnz aes_loop_initial_\@ 824 825 MOVADQ (%r10), \TMP1 826.irpc index, \i_seq 827 aesenclast \TMP1, %xmm\index # Last Round 828.endr 829.irpc index, \i_seq 830 movdqu (%arg4 , %r11, 1), \TMP1 831 pxor \TMP1, %xmm\index 832 movdqu %xmm\index, (%arg3 , %r11, 1) 833 # write back plaintext/ciphertext for num_initial_blocks 834 add $16, %r11 835 836.ifc \operation, dec 837 movdqa \TMP1, %xmm\index 838.endif 839 pshufb %xmm14, %xmm\index 840 841 # prepare plaintext/ciphertext for GHASH 
computation 842.endr 843.endif 844 845 # apply GHASH on num_initial_blocks blocks 846 847.if \i == 5 848 pxor %xmm5, %xmm6 849 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 850 pxor %xmm6, %xmm7 851 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 852 pxor %xmm7, %xmm8 853 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 854.elseif \i == 6 855 pxor %xmm6, %xmm7 856 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 857 pxor %xmm7, %xmm8 858 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 859.elseif \i == 7 860 pxor %xmm7, %xmm8 861 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 862.endif 863 cmp $64, %r13 864 jl _initial_blocks_done\@ 865 # no need for precomputed values 866/* 867* 868* Precomputations for HashKey parallel with encryption of first 4 blocks. 869* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 870*/ 871 MOVADQ ONE(%RIP),\TMP1 872 paddd \TMP1, \XMM0 # INCR Y0 873 MOVADQ \XMM0, \XMM1 874 pshufb %xmm14, \XMM1 # perform a 16 byte swap 875 876 paddd \TMP1, \XMM0 # INCR Y0 877 MOVADQ \XMM0, \XMM2 878 pshufb %xmm14, \XMM2 # perform a 16 byte swap 879 880 paddd \TMP1, \XMM0 # INCR Y0 881 MOVADQ \XMM0, \XMM3 882 pshufb %xmm14, \XMM3 # perform a 16 byte swap 883 884 paddd \TMP1, \XMM0 # INCR Y0 885 MOVADQ \XMM0, \XMM4 886 pshufb %xmm14, \XMM4 # perform a 16 byte swap 887 888 MOVADQ 0(%arg1),\TMP1 889 pxor \TMP1, \XMM1 890 pxor \TMP1, \XMM2 891 pxor \TMP1, \XMM3 892 pxor \TMP1, \XMM4 893.irpc index, 1234 # do 4 rounds 894 movaps 0x10*\index(%arg1), \TMP1 895 aesenc \TMP1, \XMM1 896 aesenc \TMP1, \XMM2 897 aesenc \TMP1, \XMM3 898 aesenc \TMP1, \XMM4 899.endr 900.irpc index, 56789 # do next 5 rounds 901 movaps 0x10*\index(%arg1), \TMP1 902 aesenc \TMP1, \XMM1 903 aesenc \TMP1, \XMM2 904 aesenc \TMP1, \XMM3 905 aesenc \TMP1, \XMM4 906.endr 907 lea 0xa0(%arg1),%r10 908 mov keysize,%eax 909 shr $2,%eax # 128->4, 192->6, 256->8 910 sub $4,%eax # 128->0, 192->2, 256->4 911 jz aes_loop_pre_done\@ 912 913aes_loop_pre_\@: 914 MOVADQ (%r10),\TMP2 915.irpc index, 1234 916 aesenc \TMP2, %xmm\index 917.endr 918 add $16,%r10 919 sub $1,%eax 920 jnz aes_loop_pre_\@ 921 922aes_loop_pre_done\@: 923 MOVADQ (%r10), \TMP2 924 aesenclast \TMP2, \XMM1 925 aesenclast \TMP2, \XMM2 926 aesenclast \TMP2, \XMM3 927 aesenclast \TMP2, \XMM4 928 movdqu 16*0(%arg4 , %r11 , 1), \TMP1 929 pxor \TMP1, \XMM1 930.ifc \operation, dec 931 movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 932 movdqa \TMP1, \XMM1 933.endif 934 movdqu 16*1(%arg4 , %r11 , 1), \TMP1 935 pxor \TMP1, \XMM2 936.ifc \operation, dec 937 movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 938 movdqa \TMP1, \XMM2 939.endif 940 movdqu 16*2(%arg4 , %r11 , 1), \TMP1 941 pxor \TMP1, \XMM3 942.ifc \operation, dec 943 movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 944 movdqa \TMP1, \XMM3 945.endif 946 movdqu 16*3(%arg4 , %r11 , 1), \TMP1 947 pxor \TMP1, \XMM4 948.ifc \operation, dec 949 movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 950 movdqa \TMP1, \XMM4 951.else 952 movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 953 movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 954 movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 955 movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 956.endif 957 958 add $64, %r11 959 pshufb %xmm14, \XMM1 # perform a 16 byte swap 960 pxor \XMMDst, \XMM1 961# combine GHASHed value with the corresponding ciphertext 962 pshufb %xmm14, \XMM2 # perform a 16 byte swap 963 pshufb %xmm14, \XMM3 # perform a 16 byte swap 964 pshufb %xmm14, \XMM4 # perform a 16 byte swap 965 966_initial_blocks_done\@: 967 968.endm 969 970/* 971* encrypt 4 blocks at a time 
972* ghash the 4 previously encrypted ciphertext blocks 973* arg1, %arg3, %arg4 are used as pointers only, not modified 974* %r11 is the data offset value 975*/ 976.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ 977TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 978 979 movdqa \XMM1, \XMM5 980 movdqa \XMM2, \XMM6 981 movdqa \XMM3, \XMM7 982 movdqa \XMM4, \XMM8 983 984 movdqa SHUF_MASK(%rip), %xmm15 985 # multiply TMP5 * HashKey using karatsuba 986 987 movdqa \XMM5, \TMP4 988 pshufd $78, \XMM5, \TMP6 989 pxor \XMM5, \TMP6 990 paddd ONE(%rip), \XMM0 # INCR CNT 991 movdqu HashKey_4(%arg2), \TMP5 992 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 993 movdqa \XMM0, \XMM1 994 paddd ONE(%rip), \XMM0 # INCR CNT 995 movdqa \XMM0, \XMM2 996 paddd ONE(%rip), \XMM0 # INCR CNT 997 movdqa \XMM0, \XMM3 998 paddd ONE(%rip), \XMM0 # INCR CNT 999 movdqa \XMM0, \XMM4 1000 pshufb %xmm15, \XMM1 # perform a 16 byte swap 1001 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 1002 pshufb %xmm15, \XMM2 # perform a 16 byte swap 1003 pshufb %xmm15, \XMM3 # perform a 16 byte swap 1004 pshufb %xmm15, \XMM4 # perform a 16 byte swap 1005 1006 pxor (%arg1), \XMM1 1007 pxor (%arg1), \XMM2 1008 pxor (%arg1), \XMM3 1009 pxor (%arg1), \XMM4 1010 movdqu HashKey_4_k(%arg2), \TMP5 1011 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 1012 movaps 0x10(%arg1), \TMP1 1013 aesenc \TMP1, \XMM1 # Round 1 1014 aesenc \TMP1, \XMM2 1015 aesenc \TMP1, \XMM3 1016 aesenc \TMP1, \XMM4 1017 movaps 0x20(%arg1), \TMP1 1018 aesenc \TMP1, \XMM1 # Round 2 1019 aesenc \TMP1, \XMM2 1020 aesenc \TMP1, \XMM3 1021 aesenc \TMP1, \XMM4 1022 movdqa \XMM6, \TMP1 1023 pshufd $78, \XMM6, \TMP2 1024 pxor \XMM6, \TMP2 1025 movdqu HashKey_3(%arg2), \TMP5 1026 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 1027 movaps 0x30(%arg1), \TMP3 1028 aesenc \TMP3, \XMM1 # Round 3 1029 aesenc \TMP3, \XMM2 1030 aesenc \TMP3, \XMM3 1031 aesenc \TMP3, \XMM4 1032 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 1033 movaps 0x40(%arg1), \TMP3 1034 aesenc \TMP3, \XMM1 # Round 4 1035 aesenc \TMP3, \XMM2 1036 aesenc \TMP3, \XMM3 1037 aesenc \TMP3, \XMM4 1038 movdqu HashKey_3_k(%arg2), \TMP5 1039 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1040 movaps 0x50(%arg1), \TMP3 1041 aesenc \TMP3, \XMM1 # Round 5 1042 aesenc \TMP3, \XMM2 1043 aesenc \TMP3, \XMM3 1044 aesenc \TMP3, \XMM4 1045 pxor \TMP1, \TMP4 1046# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1047 pxor \XMM6, \XMM5 1048 pxor \TMP2, \TMP6 1049 movdqa \XMM7, \TMP1 1050 pshufd $78, \XMM7, \TMP2 1051 pxor \XMM7, \TMP2 1052 movdqu HashKey_2(%arg2), \TMP5 1053 1054 # Multiply TMP5 * HashKey using karatsuba 1055 1056 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1057 movaps 0x60(%arg1), \TMP3 1058 aesenc \TMP3, \XMM1 # Round 6 1059 aesenc \TMP3, \XMM2 1060 aesenc \TMP3, \XMM3 1061 aesenc \TMP3, \XMM4 1062 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 1063 movaps 0x70(%arg1), \TMP3 1064 aesenc \TMP3, \XMM1 # Round 7 1065 aesenc \TMP3, \XMM2 1066 aesenc \TMP3, \XMM3 1067 aesenc \TMP3, \XMM4 1068 movdqu HashKey_2_k(%arg2), \TMP5 1069 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1070 movaps 0x80(%arg1), \TMP3 1071 aesenc \TMP3, \XMM1 # Round 8 1072 aesenc \TMP3, \XMM2 1073 aesenc \TMP3, \XMM3 1074 aesenc \TMP3, \XMM4 1075 pxor \TMP1, \TMP4 1076# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1077 pxor \XMM7, \XMM5 1078 pxor \TMP2, \TMP6 1079 1080 # Multiply XMM8 * HashKey 1081 # XMM8 and TMP5 hold the values for the two operands 1082 1083 movdqa \XMM8, 
\TMP1 1084 pshufd $78, \XMM8, \TMP2 1085 pxor \XMM8, \TMP2 1086 movdqu HashKey(%arg2), \TMP5 1087 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1088 movaps 0x90(%arg1), \TMP3 1089 aesenc \TMP3, \XMM1 # Round 9 1090 aesenc \TMP3, \XMM2 1091 aesenc \TMP3, \XMM3 1092 aesenc \TMP3, \XMM4 1093 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 1094 lea 0xa0(%arg1),%r10 1095 mov keysize,%eax 1096 shr $2,%eax # 128->4, 192->6, 256->8 1097 sub $4,%eax # 128->0, 192->2, 256->4 1098 jz aes_loop_par_enc_done\@ 1099 1100aes_loop_par_enc\@: 1101 MOVADQ (%r10),\TMP3 1102.irpc index, 1234 1103 aesenc \TMP3, %xmm\index 1104.endr 1105 add $16,%r10 1106 sub $1,%eax 1107 jnz aes_loop_par_enc\@ 1108 1109aes_loop_par_enc_done\@: 1110 MOVADQ (%r10), \TMP3 1111 aesenclast \TMP3, \XMM1 # Round 10 1112 aesenclast \TMP3, \XMM2 1113 aesenclast \TMP3, \XMM3 1114 aesenclast \TMP3, \XMM4 1115 movdqu HashKey_k(%arg2), \TMP5 1116 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1117 movdqu (%arg4,%r11,1), \TMP3 1118 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1119 movdqu 16(%arg4,%r11,1), \TMP3 1120 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1121 movdqu 32(%arg4,%r11,1), \TMP3 1122 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1123 movdqu 48(%arg4,%r11,1), \TMP3 1124 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1125 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer 1126 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer 1127 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer 1128 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer 1129 pshufb %xmm15, \XMM1 # perform a 16 byte swap 1130 pshufb %xmm15, \XMM2 # perform a 16 byte swap 1131 pshufb %xmm15, \XMM3 # perform a 16 byte swap 1132 pshufb %xmm15, \XMM4 # perform a 16 byte swap 1133 1134 pxor \TMP4, \TMP1 1135 pxor \XMM8, \XMM5 1136 pxor \TMP6, \TMP2 1137 pxor \TMP1, \TMP2 1138 pxor \XMM5, \TMP2 1139 movdqa \TMP2, \TMP3 1140 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1141 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1142 pxor \TMP3, \XMM5 1143 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1144 1145 # first phase of reduction 1146 1147 movdqa \XMM5, \TMP2 1148 movdqa \XMM5, \TMP3 1149 movdqa \XMM5, \TMP4 1150# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1151 pslld $31, \TMP2 # packed right shift << 31 1152 pslld $30, \TMP3 # packed right shift << 30 1153 pslld $25, \TMP4 # packed right shift << 25 1154 pxor \TMP3, \TMP2 # xor the shifted versions 1155 pxor \TMP4, \TMP2 1156 movdqa \TMP2, \TMP5 1157 psrldq $4, \TMP5 # right shift T5 1 DW 1158 pslldq $12, \TMP2 # left shift T2 3 DWs 1159 pxor \TMP2, \XMM5 1160 1161 # second phase of reduction 1162 1163 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1164 movdqa \XMM5,\TMP3 1165 movdqa \XMM5,\TMP4 1166 psrld $1, \TMP2 # packed left shift >>1 1167 psrld $2, \TMP3 # packed left shift >>2 1168 psrld $7, \TMP4 # packed left shift >>7 1169 pxor \TMP3,\TMP2 # xor the shifted versions 1170 pxor \TMP4,\TMP2 1171 pxor \TMP5, \TMP2 1172 pxor \TMP2, \XMM5 1173 pxor \TMP1, \XMM5 # result is in TMP1 1174 1175 pxor \XMM5, \XMM1 1176.endm 1177 1178/* 1179* decrypt 4 blocks at a time 1180* ghash the 4 previously decrypted ciphertext blocks 1181* arg1, %arg3, %arg4 are used as pointers only, not modified 1182* %r11 is the data offset value 1183*/ 1184.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ 1185TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 1186 1187 movdqa \XMM1, \XMM5 1188 movdqa \XMM2, 
\XMM6 1189 movdqa \XMM3, \XMM7 1190 movdqa \XMM4, \XMM8 1191 1192 movdqa SHUF_MASK(%rip), %xmm15 1193 # multiply TMP5 * HashKey using karatsuba 1194 1195 movdqa \XMM5, \TMP4 1196 pshufd $78, \XMM5, \TMP6 1197 pxor \XMM5, \TMP6 1198 paddd ONE(%rip), \XMM0 # INCR CNT 1199 movdqu HashKey_4(%arg2), \TMP5 1200 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 1201 movdqa \XMM0, \XMM1 1202 paddd ONE(%rip), \XMM0 # INCR CNT 1203 movdqa \XMM0, \XMM2 1204 paddd ONE(%rip), \XMM0 # INCR CNT 1205 movdqa \XMM0, \XMM3 1206 paddd ONE(%rip), \XMM0 # INCR CNT 1207 movdqa \XMM0, \XMM4 1208 pshufb %xmm15, \XMM1 # perform a 16 byte swap 1209 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 1210 pshufb %xmm15, \XMM2 # perform a 16 byte swap 1211 pshufb %xmm15, \XMM3 # perform a 16 byte swap 1212 pshufb %xmm15, \XMM4 # perform a 16 byte swap 1213 1214 pxor (%arg1), \XMM1 1215 pxor (%arg1), \XMM2 1216 pxor (%arg1), \XMM3 1217 pxor (%arg1), \XMM4 1218 movdqu HashKey_4_k(%arg2), \TMP5 1219 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 1220 movaps 0x10(%arg1), \TMP1 1221 aesenc \TMP1, \XMM1 # Round 1 1222 aesenc \TMP1, \XMM2 1223 aesenc \TMP1, \XMM3 1224 aesenc \TMP1, \XMM4 1225 movaps 0x20(%arg1), \TMP1 1226 aesenc \TMP1, \XMM1 # Round 2 1227 aesenc \TMP1, \XMM2 1228 aesenc \TMP1, \XMM3 1229 aesenc \TMP1, \XMM4 1230 movdqa \XMM6, \TMP1 1231 pshufd $78, \XMM6, \TMP2 1232 pxor \XMM6, \TMP2 1233 movdqu HashKey_3(%arg2), \TMP5 1234 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 1235 movaps 0x30(%arg1), \TMP3 1236 aesenc \TMP3, \XMM1 # Round 3 1237 aesenc \TMP3, \XMM2 1238 aesenc \TMP3, \XMM3 1239 aesenc \TMP3, \XMM4 1240 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 1241 movaps 0x40(%arg1), \TMP3 1242 aesenc \TMP3, \XMM1 # Round 4 1243 aesenc \TMP3, \XMM2 1244 aesenc \TMP3, \XMM3 1245 aesenc \TMP3, \XMM4 1246 movdqu HashKey_3_k(%arg2), \TMP5 1247 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1248 movaps 0x50(%arg1), \TMP3 1249 aesenc \TMP3, \XMM1 # Round 5 1250 aesenc \TMP3, \XMM2 1251 aesenc \TMP3, \XMM3 1252 aesenc \TMP3, \XMM4 1253 pxor \TMP1, \TMP4 1254# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1255 pxor \XMM6, \XMM5 1256 pxor \TMP2, \TMP6 1257 movdqa \XMM7, \TMP1 1258 pshufd $78, \XMM7, \TMP2 1259 pxor \XMM7, \TMP2 1260 movdqu HashKey_2(%arg2), \TMP5 1261 1262 # Multiply TMP5 * HashKey using karatsuba 1263 1264 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1265 movaps 0x60(%arg1), \TMP3 1266 aesenc \TMP3, \XMM1 # Round 6 1267 aesenc \TMP3, \XMM2 1268 aesenc \TMP3, \XMM3 1269 aesenc \TMP3, \XMM4 1270 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 1271 movaps 0x70(%arg1), \TMP3 1272 aesenc \TMP3, \XMM1 # Round 7 1273 aesenc \TMP3, \XMM2 1274 aesenc \TMP3, \XMM3 1275 aesenc \TMP3, \XMM4 1276 movdqu HashKey_2_k(%arg2), \TMP5 1277 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1278 movaps 0x80(%arg1), \TMP3 1279 aesenc \TMP3, \XMM1 # Round 8 1280 aesenc \TMP3, \XMM2 1281 aesenc \TMP3, \XMM3 1282 aesenc \TMP3, \XMM4 1283 pxor \TMP1, \TMP4 1284# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1285 pxor \XMM7, \XMM5 1286 pxor \TMP2, \TMP6 1287 1288 # Multiply XMM8 * HashKey 1289 # XMM8 and TMP5 hold the values for the two operands 1290 1291 movdqa \XMM8, \TMP1 1292 pshufd $78, \XMM8, \TMP2 1293 pxor \XMM8, \TMP2 1294 movdqu HashKey(%arg2), \TMP5 1295 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1296 movaps 0x90(%arg1), \TMP3 1297 aesenc \TMP3, \XMM1 # Round 9 1298 aesenc \TMP3, \XMM2 1299 aesenc \TMP3, \XMM3 1300 aesenc \TMP3, \XMM4 1301 pclmulqdq $0x00, \TMP5, \XMM8 # 
XMM8 = a0*b0 1302 lea 0xa0(%arg1),%r10 1303 mov keysize,%eax 1304 shr $2,%eax # 128->4, 192->6, 256->8 1305 sub $4,%eax # 128->0, 192->2, 256->4 1306 jz aes_loop_par_dec_done\@ 1307 1308aes_loop_par_dec\@: 1309 MOVADQ (%r10),\TMP3 1310.irpc index, 1234 1311 aesenc \TMP3, %xmm\index 1312.endr 1313 add $16,%r10 1314 sub $1,%eax 1315 jnz aes_loop_par_dec\@ 1316 1317aes_loop_par_dec_done\@: 1318 MOVADQ (%r10), \TMP3 1319 aesenclast \TMP3, \XMM1 # last round 1320 aesenclast \TMP3, \XMM2 1321 aesenclast \TMP3, \XMM3 1322 aesenclast \TMP3, \XMM4 1323 movdqu HashKey_k(%arg2), \TMP5 1324 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1325 movdqu (%arg4,%r11,1), \TMP3 1326 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1327 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer 1328 movdqa \TMP3, \XMM1 1329 movdqu 16(%arg4,%r11,1), \TMP3 1330 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1331 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer 1332 movdqa \TMP3, \XMM2 1333 movdqu 32(%arg4,%r11,1), \TMP3 1334 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1335 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer 1336 movdqa \TMP3, \XMM3 1337 movdqu 48(%arg4,%r11,1), \TMP3 1338 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1339 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer 1340 movdqa \TMP3, \XMM4 1341 pshufb %xmm15, \XMM1 # perform a 16 byte swap 1342 pshufb %xmm15, \XMM2 # perform a 16 byte swap 1343 pshufb %xmm15, \XMM3 # perform a 16 byte swap 1344 pshufb %xmm15, \XMM4 # perform a 16 byte swap 1345 1346 pxor \TMP4, \TMP1 1347 pxor \XMM8, \XMM5 1348 pxor \TMP6, \TMP2 1349 pxor \TMP1, \TMP2 1350 pxor \XMM5, \TMP2 1351 movdqa \TMP2, \TMP3 1352 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1353 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1354 pxor \TMP3, \XMM5 1355 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1356 1357 # first phase of reduction 1358 1359 movdqa \XMM5, \TMP2 1360 movdqa \XMM5, \TMP3 1361 movdqa \XMM5, \TMP4 1362# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1363 pslld $31, \TMP2 # packed right shift << 31 1364 pslld $30, \TMP3 # packed right shift << 30 1365 pslld $25, \TMP4 # packed right shift << 25 1366 pxor \TMP3, \TMP2 # xor the shifted versions 1367 pxor \TMP4, \TMP2 1368 movdqa \TMP2, \TMP5 1369 psrldq $4, \TMP5 # right shift T5 1 DW 1370 pslldq $12, \TMP2 # left shift T2 3 DWs 1371 pxor \TMP2, \XMM5 1372 1373 # second phase of reduction 1374 1375 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1376 movdqa \XMM5,\TMP3 1377 movdqa \XMM5,\TMP4 1378 psrld $1, \TMP2 # packed left shift >>1 1379 psrld $2, \TMP3 # packed left shift >>2 1380 psrld $7, \TMP4 # packed left shift >>7 1381 pxor \TMP3,\TMP2 # xor the shifted versions 1382 pxor \TMP4,\TMP2 1383 pxor \TMP5, \TMP2 1384 pxor \TMP2, \XMM5 1385 pxor \TMP1, \XMM5 # result is in TMP1 1386 1387 pxor \XMM5, \XMM1 1388.endm 1389 1390/* GHASH the last 4 ciphertext blocks. 
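* XMM1..XMM4 are multiplied by HashKey_4..HashKey (i.e. H^4..H^1), each with
* the same Karatsuba split used elsewhere; the partial products are
* accumulated and a single reduction folds the total into XMMDst.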
*/ 1391.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ 1392TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst 1393 1394 # Multiply TMP6 * HashKey (using Karatsuba) 1395 1396 movdqa \XMM1, \TMP6 1397 pshufd $78, \XMM1, \TMP2 1398 pxor \XMM1, \TMP2 1399 movdqu HashKey_4(%arg2), \TMP5 1400 pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1 1401 pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0 1402 movdqu HashKey_4_k(%arg2), \TMP4 1403 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1404 movdqa \XMM1, \XMMDst 1405 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 1406 1407 # Multiply TMP1 * HashKey (using Karatsuba) 1408 1409 movdqa \XMM2, \TMP1 1410 pshufd $78, \XMM2, \TMP2 1411 pxor \XMM2, \TMP2 1412 movdqu HashKey_3(%arg2), \TMP5 1413 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1414 pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0 1415 movdqu HashKey_3_k(%arg2), \TMP4 1416 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1417 pxor \TMP1, \TMP6 1418 pxor \XMM2, \XMMDst 1419 pxor \TMP2, \XMM1 1420# results accumulated in TMP6, XMMDst, XMM1 1421 1422 # Multiply TMP1 * HashKey (using Karatsuba) 1423 1424 movdqa \XMM3, \TMP1 1425 pshufd $78, \XMM3, \TMP2 1426 pxor \XMM3, \TMP2 1427 movdqu HashKey_2(%arg2), \TMP5 1428 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1429 pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0 1430 movdqu HashKey_2_k(%arg2), \TMP4 1431 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1432 pxor \TMP1, \TMP6 1433 pxor \XMM3, \XMMDst 1434 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 1435 1436 # Multiply TMP1 * HashKey (using Karatsuba) 1437 movdqa \XMM4, \TMP1 1438 pshufd $78, \XMM4, \TMP2 1439 pxor \XMM4, \TMP2 1440 movdqu HashKey(%arg2), \TMP5 1441 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1442 pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0 1443 movdqu HashKey_k(%arg2), \TMP4 1444 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1445 pxor \TMP1, \TMP6 1446 pxor \XMM4, \XMMDst 1447 pxor \XMM1, \TMP2 1448 pxor \TMP6, \TMP2 1449 pxor \XMMDst, \TMP2 1450 # middle section of the temp results combined as in karatsuba algorithm 1451 movdqa \TMP2, \TMP4 1452 pslldq $8, \TMP4 # left shift TMP4 2 DWs 1453 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1454 pxor \TMP4, \XMMDst 1455 pxor \TMP2, \TMP6 1456# TMP6:XMMDst holds the result of the accumulated carry-less multiplications 1457 # first phase of the reduction 1458 movdqa \XMMDst, \TMP2 1459 movdqa \XMMDst, \TMP3 1460 movdqa \XMMDst, \TMP4 1461# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently 1462 pslld $31, \TMP2 # packed right shifting << 31 1463 pslld $30, \TMP3 # packed right shifting << 30 1464 pslld $25, \TMP4 # packed right shifting << 25 1465 pxor \TMP3, \TMP2 # xor the shifted versions 1466 pxor \TMP4, \TMP2 1467 movdqa \TMP2, \TMP7 1468 psrldq $4, \TMP7 # right shift TMP7 1 DW 1469 pslldq $12, \TMP2 # left shift TMP2 3 DWs 1470 pxor \TMP2, \XMMDst 1471 1472 # second phase of the reduction 1473 movdqa \XMMDst, \TMP2 1474 # make 3 copies of XMMDst for doing 3 shift operations 1475 movdqa \XMMDst, \TMP3 1476 movdqa \XMMDst, \TMP4 1477 psrld $1, \TMP2 # packed left shift >> 1 1478 psrld $2, \TMP3 # packed left shift >> 2 1479 psrld $7, \TMP4 # packed left shift >> 7 1480 pxor \TMP3, \TMP2 # xor the shifted versions 1481 pxor \TMP4, \TMP2 1482 pxor \TMP7, \TMP2 1483 pxor \TMP2, \XMMDst 1484 pxor \TMP6, \XMMDst # reduced result is in XMMDst 1485.endm 1486 1487 1488/* Encryption of a single block 1489* uses eax & r10 1490*/ 1491 1492.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 1493 1494 pxor (%arg1), 
\XMM0 1495 mov keysize,%eax 1496 shr $2,%eax # 128->4, 192->6, 256->8 1497 add $5,%eax # 128->9, 192->11, 256->13 1498 lea 16(%arg1), %r10 # get first expanded key address 1499 1500_esb_loop_\@: 1501 MOVADQ (%r10),\TMP1 1502 aesenc \TMP1,\XMM0 1503 add $16,%r10 1504 sub $1,%eax 1505 jnz _esb_loop_\@ 1506 1507 MOVADQ (%r10),\TMP1 1508 aesenclast \TMP1,\XMM0 1509.endm 1510/***************************************************************************** 1511* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1512* struct gcm_context_data *data 1513* // Context data 1514* u8 *out, // Plaintext output. Encrypt in-place is allowed. 1515* const u8 *in, // Ciphertext input 1516* u64 plaintext_len, // Length of data in bytes for decryption. 1517* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1518* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1519* // concatenated with 0x00000001. 16-byte aligned pointer. 1520* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1521* const u8 *aad, // Additional Authentication Data (AAD) 1522* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1523* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the 1524* // given authentication tag and only return the plaintext if they match. 1525* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 1526* // (most likely), 12 or 8. 1527* 1528* Assumptions: 1529* 1530* keys: 1531* keys are pre-expanded and aligned to 16 bytes. we are using the first 1532* set of 11 keys in the data structure void *aes_ctx 1533* 1534* iv: 1535* 0 1 2 3 1536* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1537* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1538* | Salt (From the SA) | 1539* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1540* | Initialization Vector | 1541* | (This is the sequence number from IPSec header) | 1542* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1543* | 0x1 | 1544* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1545* 1546* 1547* 1548* AAD: 1549* AAD padded to 128 bits with 0 1550* for example, assume AAD is a u32 vector 1551* 1552* if AAD is 8 bytes: 1553* AAD[3] = {A0, A1}; 1554* padded AAD in xmm register = {A1 A0 0 0} 1555* 1556* 0 1 2 3 1557* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1558* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1559* | SPI (A1) | 1560* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1561* | 32-bit Sequence Number (A0) | 1562* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1563* | 0x0 | 1564* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1565* 1566* AAD Format with 32-bit Sequence Number 1567* 1568* if AAD is 12 bytes: 1569* AAD[3] = {A0, A1, A2}; 1570* padded AAD in xmm register = {A2 A1 A0 0} 1571* 1572* 0 1 2 3 1573* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1574* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1575* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1576* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1577* | SPI (A2) | 1578* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1579* | 64-bit Extended Sequence Number {A1,A0} | 1580* | | 1581* 
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1582* | 0x0 | 1583* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1584* 1585* AAD Format with 64-bit Extended Sequence Number 1586* 1587* poly = x^128 + x^127 + x^126 + x^121 + 1 1588* 1589*****************************************************************************/ 1590SYM_FUNC_START(aesni_gcm_dec) 1591 FUNC_SAVE 1592 1593 GCM_INIT %arg6, arg7, arg8, arg9 1594 GCM_ENC_DEC dec 1595 GCM_COMPLETE arg10, arg11 1596 FUNC_RESTORE 1597 ret 1598SYM_FUNC_END(aesni_gcm_dec) 1599 1600 1601/***************************************************************************** 1602* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1603* struct gcm_context_data *data 1604* // Context data 1605* u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1606* const u8 *in, // Plaintext input 1607* u64 plaintext_len, // Length of data in bytes for encryption. 1608* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1609* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1610* // concatenated with 0x00000001. 16-byte aligned pointer. 1611* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1612* const u8 *aad, // Additional Authentication Data (AAD) 1613* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1614* u8 *auth_tag, // Authenticated Tag output. 1615* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 1616* // 12 or 8. 1617* 1618* Assumptions: 1619* 1620* keys: 1621* keys are pre-expanded and aligned to 16 bytes. we are using the 1622* first set of 11 keys in the data structure void *aes_ctx 1623* 1624* 1625* iv: 1626* 0 1 2 3 1627* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1628* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1629* | Salt (From the SA) | 1630* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1631* | Initialization Vector | 1632* | (This is the sequence number from IPSec header) | 1633* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1634* | 0x1 | 1635* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1636* 1637* 1638* 1639* AAD: 1640* AAD padded to 128 bits with 0 1641* for example, assume AAD is a u32 vector 1642* 1643* if AAD is 8 bytes: 1644* AAD[3] = {A0, A1}; 1645* padded AAD in xmm register = {A1 A0 0 0} 1646* 1647* 0 1 2 3 1648* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1649* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1650* | SPI (A1) | 1651* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1652* | 32-bit Sequence Number (A0) | 1653* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1654* | 0x0 | 1655* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1656* 1657* AAD Format with 32-bit Sequence Number 1658* 1659* if AAD is 12 bytes: 1660* AAD[3] = {A0, A1, A2}; 1661* padded AAD in xmm register = {A2 A1 A0 0} 1662* 1663* 0 1 2 3 1664* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1665* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1666* | SPI (A2) | 1667* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1668* | 64-bit Extended Sequence Number {A1,A0} | 1669* | | 1670* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1671* | 0x0 | 1672* 
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1673* 1674* AAD Format with 64-bit Extended Sequence Number 1675* 1676* poly = x^128 + x^127 + x^126 + x^121 + 1 1677***************************************************************************/ 1678SYM_FUNC_START(aesni_gcm_enc) 1679 FUNC_SAVE 1680 1681 GCM_INIT %arg6, arg7, arg8, arg9 1682 GCM_ENC_DEC enc 1683 1684 GCM_COMPLETE arg10, arg11 1685 FUNC_RESTORE 1686 ret 1687SYM_FUNC_END(aesni_gcm_enc) 1688 1689/***************************************************************************** 1690* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1691* struct gcm_context_data *data, 1692* // context data 1693* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1694* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1695* // concatenated with 0x00000001. 16-byte aligned pointer. 1696* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1697* const u8 *aad, // Additional Authentication Data (AAD) 1698* u64 aad_len) // Length of AAD in bytes. 1699*/ 1700SYM_FUNC_START(aesni_gcm_init) 1701 FUNC_SAVE 1702 GCM_INIT %arg3, %arg4,%arg5, %arg6 1703 FUNC_RESTORE 1704 ret 1705SYM_FUNC_END(aesni_gcm_init) 1706 1707/***************************************************************************** 1708* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1709* struct gcm_context_data *data, 1710* // context data 1711* u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1712* const u8 *in, // Plaintext input 1713* u64 plaintext_len, // Length of data in bytes for encryption. 1714*/ 1715SYM_FUNC_START(aesni_gcm_enc_update) 1716 FUNC_SAVE 1717 GCM_ENC_DEC enc 1718 FUNC_RESTORE 1719 ret 1720SYM_FUNC_END(aesni_gcm_enc_update) 1721 1722/***************************************************************************** 1723* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1724* struct gcm_context_data *data, 1725* // context data 1726* u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1727* const u8 *in, // Plaintext input 1728* u64 plaintext_len, // Length of data in bytes for encryption. 1729*/ 1730SYM_FUNC_START(aesni_gcm_dec_update) 1731 FUNC_SAVE 1732 GCM_ENC_DEC dec 1733 FUNC_RESTORE 1734 ret 1735SYM_FUNC_END(aesni_gcm_dec_update) 1736 1737/***************************************************************************** 1738* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1739* struct gcm_context_data *data, 1740* // context data 1741* u8 *auth_tag, // Authenticated Tag output. 1742* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 1743* // 12 or 8. 
1744*/ 1745SYM_FUNC_START(aesni_gcm_finalize) 1746 FUNC_SAVE 1747 GCM_COMPLETE %arg3 %arg4 1748 FUNC_RESTORE 1749 ret 1750SYM_FUNC_END(aesni_gcm_finalize) 1751 1752#endif 1753 1754 1755SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128) 1756SYM_FUNC_START_LOCAL(_key_expansion_256a) 1757 pshufd $0b11111111, %xmm1, %xmm1 1758 shufps $0b00010000, %xmm0, %xmm4 1759 pxor %xmm4, %xmm0 1760 shufps $0b10001100, %xmm0, %xmm4 1761 pxor %xmm4, %xmm0 1762 pxor %xmm1, %xmm0 1763 movaps %xmm0, (TKEYP) 1764 add $0x10, TKEYP 1765 ret 1766SYM_FUNC_END(_key_expansion_256a) 1767SYM_FUNC_END_ALIAS(_key_expansion_128) 1768 1769SYM_FUNC_START_LOCAL(_key_expansion_192a) 1770 pshufd $0b01010101, %xmm1, %xmm1 1771 shufps $0b00010000, %xmm0, %xmm4 1772 pxor %xmm4, %xmm0 1773 shufps $0b10001100, %xmm0, %xmm4 1774 pxor %xmm4, %xmm0 1775 pxor %xmm1, %xmm0 1776 1777 movaps %xmm2, %xmm5 1778 movaps %xmm2, %xmm6 1779 pslldq $4, %xmm5 1780 pshufd $0b11111111, %xmm0, %xmm3 1781 pxor %xmm3, %xmm2 1782 pxor %xmm5, %xmm2 1783 1784 movaps %xmm0, %xmm1 1785 shufps $0b01000100, %xmm0, %xmm6 1786 movaps %xmm6, (TKEYP) 1787 shufps $0b01001110, %xmm2, %xmm1 1788 movaps %xmm1, 0x10(TKEYP) 1789 add $0x20, TKEYP 1790 ret 1791SYM_FUNC_END(_key_expansion_192a) 1792 1793SYM_FUNC_START_LOCAL(_key_expansion_192b) 1794 pshufd $0b01010101, %xmm1, %xmm1 1795 shufps $0b00010000, %xmm0, %xmm4 1796 pxor %xmm4, %xmm0 1797 shufps $0b10001100, %xmm0, %xmm4 1798 pxor %xmm4, %xmm0 1799 pxor %xmm1, %xmm0 1800 1801 movaps %xmm2, %xmm5 1802 pslldq $4, %xmm5 1803 pshufd $0b11111111, %xmm0, %xmm3 1804 pxor %xmm3, %xmm2 1805 pxor %xmm5, %xmm2 1806 1807 movaps %xmm0, (TKEYP) 1808 add $0x10, TKEYP 1809 ret 1810SYM_FUNC_END(_key_expansion_192b) 1811 1812SYM_FUNC_START_LOCAL(_key_expansion_256b) 1813 pshufd $0b10101010, %xmm1, %xmm1 1814 shufps $0b00010000, %xmm2, %xmm4 1815 pxor %xmm4, %xmm2 1816 shufps $0b10001100, %xmm2, %xmm4 1817 pxor %xmm4, %xmm2 1818 pxor %xmm1, %xmm2 1819 movaps %xmm2, (TKEYP) 1820 add $0x10, TKEYP 1821 ret 1822SYM_FUNC_END(_key_expansion_256b) 1823 1824/* 1825 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 1826 * unsigned int key_len) 1827 */ 1828SYM_FUNC_START(aesni_set_key) 1829 FRAME_BEGIN 1830#ifndef __x86_64__ 1831 pushl KEYP 1832 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx 1833 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key 1834 movl (FRAME_OFFSET+16)(%esp), %edx # key_len 1835#endif 1836 movups (UKEYP), %xmm0 # user key (first 16 bytes) 1837 movaps %xmm0, (KEYP) 1838 lea 0x10(KEYP), TKEYP # key addr 1839 movl %edx, 480(KEYP) 1840 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1841 cmp $24, %dl 1842 jb .Lenc_key128 1843 je .Lenc_key192 1844 movups 0x10(UKEYP), %xmm2 # other user key 1845 movaps %xmm2, (TKEYP) 1846 add $0x10, TKEYP 1847 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 1848 call _key_expansion_256a 1849 aeskeygenassist $0x1, %xmm0, %xmm1 1850 call _key_expansion_256b 1851 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 1852 call _key_expansion_256a 1853 aeskeygenassist $0x2, %xmm0, %xmm1 1854 call _key_expansion_256b 1855 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 1856 call _key_expansion_256a 1857 aeskeygenassist $0x4, %xmm0, %xmm1 1858 call _key_expansion_256b 1859 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 1860 call _key_expansion_256a 1861 aeskeygenassist $0x8, %xmm0, %xmm1 1862 call _key_expansion_256b 1863 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 1864 call _key_expansion_256a 1865 aeskeygenassist $0x10, %xmm0, %xmm1 1866 call _key_expansion_256b 1867 aeskeygenassist $0x20, %xmm2, %xmm1 # 
round 6 1868 call _key_expansion_256a 1869 aeskeygenassist $0x20, %xmm0, %xmm1 1870 call _key_expansion_256b 1871 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 1872 call _key_expansion_256a 1873 jmp .Ldec_key 1874.Lenc_key192: 1875 movq 0x10(UKEYP), %xmm2 # other user key 1876 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 1877 call _key_expansion_192a 1878 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 1879 call _key_expansion_192b 1880 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 1881 call _key_expansion_192a 1882 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 1883 call _key_expansion_192b 1884 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 1885 call _key_expansion_192a 1886 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 1887 call _key_expansion_192b 1888 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 1889 call _key_expansion_192a 1890 aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 1891 call _key_expansion_192b 1892 jmp .Ldec_key 1893.Lenc_key128: 1894 aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 1895 call _key_expansion_128 1896 aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 1897 call _key_expansion_128 1898 aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 1899 call _key_expansion_128 1900 aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 1901 call _key_expansion_128 1902 aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 1903 call _key_expansion_128 1904 aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 1905 call _key_expansion_128 1906 aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 1907 call _key_expansion_128 1908 aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 1909 call _key_expansion_128 1910 aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 1911 call _key_expansion_128 1912 aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 1913 call _key_expansion_128 1914.Ldec_key: 1915 sub $0x10, TKEYP 1916 movaps (KEYP), %xmm0 1917 movaps (TKEYP), %xmm1 1918 movaps %xmm0, 240(TKEYP) 1919 movaps %xmm1, 240(KEYP) 1920 add $0x10, KEYP 1921 lea 240-16(TKEYP), UKEYP 1922.align 4 1923.Ldec_key_loop: 1924 movaps (KEYP), %xmm0 1925 aesimc %xmm0, %xmm1 1926 movaps %xmm1, (UKEYP) 1927 add $0x10, KEYP 1928 sub $0x10, UKEYP 1929 cmp TKEYP, KEYP 1930 jb .Ldec_key_loop 1931 xor AREG, AREG 1932#ifndef __x86_64__ 1933 popl KEYP 1934#endif 1935 FRAME_END 1936 ret 1937SYM_FUNC_END(aesni_set_key) 1938 1939/* 1940 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src) 1941 */ 1942SYM_FUNC_START(aesni_enc) 1943 FRAME_BEGIN 1944#ifndef __x86_64__ 1945 pushl KEYP 1946 pushl KLEN 1947 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx 1948 movl (FRAME_OFFSET+16)(%esp), OUTP # dst 1949 movl (FRAME_OFFSET+20)(%esp), INP # src 1950#endif 1951 movl 480(KEYP), KLEN # key length 1952 movups (INP), STATE # input 1953 call _aesni_enc1 1954 movups STATE, (OUTP) # output 1955#ifndef __x86_64__ 1956 popl KLEN 1957 popl KEYP 1958#endif 1959 FRAME_END 1960 ret 1961SYM_FUNC_END(aesni_enc) 1962 1963/* 1964 * _aesni_enc1: internal ABI 1965 * input: 1966 * KEYP: key struct pointer 1967 * KLEN: round count 1968 * STATE: initial state (input) 1969 * output: 1970 * STATE: finial state (output) 1971 * changed: 1972 * KEY 1973 * TKEYP (T1) 1974 */ 1975SYM_FUNC_START_LOCAL(_aesni_enc1) 1976 movaps (KEYP), KEY # key 1977 mov KEYP, TKEYP 1978 pxor KEY, STATE # round 0 1979 add $0x30, TKEYP 1980 cmp $24, KLEN 1981 jb .Lenc128 1982 lea 0x20(TKEYP), TKEYP 1983 je .Lenc192 1984 add $0x20, TKEYP 1985 movaps -0x60(TKEYP), KEY 1986 aesenc KEY, STATE 1987 movaps -0x50(TKEYP), KEY 1988 aesenc KEY, STATE 1989.align 4 1990.Lenc192: 1991 movaps -0x40(TKEYP), KEY 1992 aesenc KEY, STATE 1993 movaps 
-0x30(TKEYP), KEY 1994 aesenc KEY, STATE 1995.align 4 1996.Lenc128: 1997 movaps -0x20(TKEYP), KEY 1998 aesenc KEY, STATE 1999 movaps -0x10(TKEYP), KEY 2000 aesenc KEY, STATE 2001 movaps (TKEYP), KEY 2002 aesenc KEY, STATE 2003 movaps 0x10(TKEYP), KEY 2004 aesenc KEY, STATE 2005 movaps 0x20(TKEYP), KEY 2006 aesenc KEY, STATE 2007 movaps 0x30(TKEYP), KEY 2008 aesenc KEY, STATE 2009 movaps 0x40(TKEYP), KEY 2010 aesenc KEY, STATE 2011 movaps 0x50(TKEYP), KEY 2012 aesenc KEY, STATE 2013 movaps 0x60(TKEYP), KEY 2014 aesenc KEY, STATE 2015 movaps 0x70(TKEYP), KEY 2016 aesenclast KEY, STATE 2017 ret 2018SYM_FUNC_END(_aesni_enc1) 2019 2020/* 2021 * _aesni_enc4: internal ABI 2022 * input: 2023 * KEYP: key struct pointer 2024 * KLEN: round count 2025 * STATE1: initial state (input) 2026 * STATE2 2027 * STATE3 2028 * STATE4 2029 * output: 2030 * STATE1: finial state (output) 2031 * STATE2 2032 * STATE3 2033 * STATE4 2034 * changed: 2035 * KEY 2036 * TKEYP (T1) 2037 */ 2038SYM_FUNC_START_LOCAL(_aesni_enc4) 2039 movaps (KEYP), KEY # key 2040 mov KEYP, TKEYP 2041 pxor KEY, STATE1 # round 0 2042 pxor KEY, STATE2 2043 pxor KEY, STATE3 2044 pxor KEY, STATE4 2045 add $0x30, TKEYP 2046 cmp $24, KLEN 2047 jb .L4enc128 2048 lea 0x20(TKEYP), TKEYP 2049 je .L4enc192 2050 add $0x20, TKEYP 2051 movaps -0x60(TKEYP), KEY 2052 aesenc KEY, STATE1 2053 aesenc KEY, STATE2 2054 aesenc KEY, STATE3 2055 aesenc KEY, STATE4 2056 movaps -0x50(TKEYP), KEY 2057 aesenc KEY, STATE1 2058 aesenc KEY, STATE2 2059 aesenc KEY, STATE3 2060 aesenc KEY, STATE4 2061#.align 4 2062.L4enc192: 2063 movaps -0x40(TKEYP), KEY 2064 aesenc KEY, STATE1 2065 aesenc KEY, STATE2 2066 aesenc KEY, STATE3 2067 aesenc KEY, STATE4 2068 movaps -0x30(TKEYP), KEY 2069 aesenc KEY, STATE1 2070 aesenc KEY, STATE2 2071 aesenc KEY, STATE3 2072 aesenc KEY, STATE4 2073#.align 4 2074.L4enc128: 2075 movaps -0x20(TKEYP), KEY 2076 aesenc KEY, STATE1 2077 aesenc KEY, STATE2 2078 aesenc KEY, STATE3 2079 aesenc KEY, STATE4 2080 movaps -0x10(TKEYP), KEY 2081 aesenc KEY, STATE1 2082 aesenc KEY, STATE2 2083 aesenc KEY, STATE3 2084 aesenc KEY, STATE4 2085 movaps (TKEYP), KEY 2086 aesenc KEY, STATE1 2087 aesenc KEY, STATE2 2088 aesenc KEY, STATE3 2089 aesenc KEY, STATE4 2090 movaps 0x10(TKEYP), KEY 2091 aesenc KEY, STATE1 2092 aesenc KEY, STATE2 2093 aesenc KEY, STATE3 2094 aesenc KEY, STATE4 2095 movaps 0x20(TKEYP), KEY 2096 aesenc KEY, STATE1 2097 aesenc KEY, STATE2 2098 aesenc KEY, STATE3 2099 aesenc KEY, STATE4 2100 movaps 0x30(TKEYP), KEY 2101 aesenc KEY, STATE1 2102 aesenc KEY, STATE2 2103 aesenc KEY, STATE3 2104 aesenc KEY, STATE4 2105 movaps 0x40(TKEYP), KEY 2106 aesenc KEY, STATE1 2107 aesenc KEY, STATE2 2108 aesenc KEY, STATE3 2109 aesenc KEY, STATE4 2110 movaps 0x50(TKEYP), KEY 2111 aesenc KEY, STATE1 2112 aesenc KEY, STATE2 2113 aesenc KEY, STATE3 2114 aesenc KEY, STATE4 2115 movaps 0x60(TKEYP), KEY 2116 aesenc KEY, STATE1 2117 aesenc KEY, STATE2 2118 aesenc KEY, STATE3 2119 aesenc KEY, STATE4 2120 movaps 0x70(TKEYP), KEY 2121 aesenclast KEY, STATE1 # last round 2122 aesenclast KEY, STATE2 2123 aesenclast KEY, STATE3 2124 aesenclast KEY, STATE4 2125 ret 2126SYM_FUNC_END(_aesni_enc4) 2127 2128/* 2129 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src) 2130 */ 2131SYM_FUNC_START(aesni_dec) 2132 FRAME_BEGIN 2133#ifndef __x86_64__ 2134 pushl KEYP 2135 pushl KLEN 2136 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx 2137 movl (FRAME_OFFSET+16)(%esp), OUTP # dst 2138 movl (FRAME_OFFSET+20)(%esp), INP # src 2139#endif 2140 mov 480(KEYP), KLEN # key length 2141 add $240, KEYP 
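/*
 * The two instructions above encode the layout of struct crypto_aes_ctx
 * that this file relies on: the expanded encryption round keys sit at
 * offset 0, the decryption round keys at offset 240 (15 x 16-byte round
 * keys), and the key length at offset 480.  A minimal C sketch of that
 * layout, for orientation only (field names illustrative; see
 * <crypto/aes.h> for the authoritative definition):
 *
 *	#include <stdint.h>
 *
 *	struct aes_ctx_layout {			// illustrative, not the real type
 *		uint8_t  key_enc[15 * 16];	// offset   0: encryption schedule
 *		uint8_t  key_dec[15 * 16];	// offset 240: decryption schedule
 *		uint32_t key_length;		// offset 480: 16, 24 or 32
 *	};
 *
 * The decryption entry points therefore advance KEYP by 240 so that the
 * offsets used by _aesni_dec1/_aesni_dec4 walk the decryption schedule.
 */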
2142 movups (INP), STATE # input 2143 call _aesni_dec1 2144 movups STATE, (OUTP) #output 2145#ifndef __x86_64__ 2146 popl KLEN 2147 popl KEYP 2148#endif 2149 FRAME_END 2150 ret 2151SYM_FUNC_END(aesni_dec) 2152 2153/* 2154 * _aesni_dec1: internal ABI 2155 * input: 2156 * KEYP: key struct pointer 2157 * KLEN: key length 2158 * STATE: initial state (input) 2159 * output: 2160 * STATE: finial state (output) 2161 * changed: 2162 * KEY 2163 * TKEYP (T1) 2164 */ 2165SYM_FUNC_START_LOCAL(_aesni_dec1) 2166 movaps (KEYP), KEY # key 2167 mov KEYP, TKEYP 2168 pxor KEY, STATE # round 0 2169 add $0x30, TKEYP 2170 cmp $24, KLEN 2171 jb .Ldec128 2172 lea 0x20(TKEYP), TKEYP 2173 je .Ldec192 2174 add $0x20, TKEYP 2175 movaps -0x60(TKEYP), KEY 2176 aesdec KEY, STATE 2177 movaps -0x50(TKEYP), KEY 2178 aesdec KEY, STATE 2179.align 4 2180.Ldec192: 2181 movaps -0x40(TKEYP), KEY 2182 aesdec KEY, STATE 2183 movaps -0x30(TKEYP), KEY 2184 aesdec KEY, STATE 2185.align 4 2186.Ldec128: 2187 movaps -0x20(TKEYP), KEY 2188 aesdec KEY, STATE 2189 movaps -0x10(TKEYP), KEY 2190 aesdec KEY, STATE 2191 movaps (TKEYP), KEY 2192 aesdec KEY, STATE 2193 movaps 0x10(TKEYP), KEY 2194 aesdec KEY, STATE 2195 movaps 0x20(TKEYP), KEY 2196 aesdec KEY, STATE 2197 movaps 0x30(TKEYP), KEY 2198 aesdec KEY, STATE 2199 movaps 0x40(TKEYP), KEY 2200 aesdec KEY, STATE 2201 movaps 0x50(TKEYP), KEY 2202 aesdec KEY, STATE 2203 movaps 0x60(TKEYP), KEY 2204 aesdec KEY, STATE 2205 movaps 0x70(TKEYP), KEY 2206 aesdeclast KEY, STATE 2207 ret 2208SYM_FUNC_END(_aesni_dec1) 2209 2210/* 2211 * _aesni_dec4: internal ABI 2212 * input: 2213 * KEYP: key struct pointer 2214 * KLEN: key length 2215 * STATE1: initial state (input) 2216 * STATE2 2217 * STATE3 2218 * STATE4 2219 * output: 2220 * STATE1: finial state (output) 2221 * STATE2 2222 * STATE3 2223 * STATE4 2224 * changed: 2225 * KEY 2226 * TKEYP (T1) 2227 */ 2228SYM_FUNC_START_LOCAL(_aesni_dec4) 2229 movaps (KEYP), KEY # key 2230 mov KEYP, TKEYP 2231 pxor KEY, STATE1 # round 0 2232 pxor KEY, STATE2 2233 pxor KEY, STATE3 2234 pxor KEY, STATE4 2235 add $0x30, TKEYP 2236 cmp $24, KLEN 2237 jb .L4dec128 2238 lea 0x20(TKEYP), TKEYP 2239 je .L4dec192 2240 add $0x20, TKEYP 2241 movaps -0x60(TKEYP), KEY 2242 aesdec KEY, STATE1 2243 aesdec KEY, STATE2 2244 aesdec KEY, STATE3 2245 aesdec KEY, STATE4 2246 movaps -0x50(TKEYP), KEY 2247 aesdec KEY, STATE1 2248 aesdec KEY, STATE2 2249 aesdec KEY, STATE3 2250 aesdec KEY, STATE4 2251.align 4 2252.L4dec192: 2253 movaps -0x40(TKEYP), KEY 2254 aesdec KEY, STATE1 2255 aesdec KEY, STATE2 2256 aesdec KEY, STATE3 2257 aesdec KEY, STATE4 2258 movaps -0x30(TKEYP), KEY 2259 aesdec KEY, STATE1 2260 aesdec KEY, STATE2 2261 aesdec KEY, STATE3 2262 aesdec KEY, STATE4 2263.align 4 2264.L4dec128: 2265 movaps -0x20(TKEYP), KEY 2266 aesdec KEY, STATE1 2267 aesdec KEY, STATE2 2268 aesdec KEY, STATE3 2269 aesdec KEY, STATE4 2270 movaps -0x10(TKEYP), KEY 2271 aesdec KEY, STATE1 2272 aesdec KEY, STATE2 2273 aesdec KEY, STATE3 2274 aesdec KEY, STATE4 2275 movaps (TKEYP), KEY 2276 aesdec KEY, STATE1 2277 aesdec KEY, STATE2 2278 aesdec KEY, STATE3 2279 aesdec KEY, STATE4 2280 movaps 0x10(TKEYP), KEY 2281 aesdec KEY, STATE1 2282 aesdec KEY, STATE2 2283 aesdec KEY, STATE3 2284 aesdec KEY, STATE4 2285 movaps 0x20(TKEYP), KEY 2286 aesdec KEY, STATE1 2287 aesdec KEY, STATE2 2288 aesdec KEY, STATE3 2289 aesdec KEY, STATE4 2290 movaps 0x30(TKEYP), KEY 2291 aesdec KEY, STATE1 2292 aesdec KEY, STATE2 2293 aesdec KEY, STATE3 2294 aesdec KEY, STATE4 2295 movaps 0x40(TKEYP), KEY 2296 aesdec KEY, STATE1 2297 
aesdec KEY, STATE2 2298 aesdec KEY, STATE3 2299 aesdec KEY, STATE4 2300 movaps 0x50(TKEYP), KEY 2301 aesdec KEY, STATE1 2302 aesdec KEY, STATE2 2303 aesdec KEY, STATE3 2304 aesdec KEY, STATE4 2305 movaps 0x60(TKEYP), KEY 2306 aesdec KEY, STATE1 2307 aesdec KEY, STATE2 2308 aesdec KEY, STATE3 2309 aesdec KEY, STATE4 2310 movaps 0x70(TKEYP), KEY 2311 aesdeclast KEY, STATE1 # last round 2312 aesdeclast KEY, STATE2 2313 aesdeclast KEY, STATE3 2314 aesdeclast KEY, STATE4 2315 ret 2316SYM_FUNC_END(_aesni_dec4) 2317 2318/* 2319 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2320 * size_t len) 2321 */ 2322SYM_FUNC_START(aesni_ecb_enc) 2323 FRAME_BEGIN 2324#ifndef __x86_64__ 2325 pushl LEN 2326 pushl KEYP 2327 pushl KLEN 2328 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2329 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2330 movl (FRAME_OFFSET+24)(%esp), INP # src 2331 movl (FRAME_OFFSET+28)(%esp), LEN # len 2332#endif 2333 test LEN, LEN # check length 2334 jz .Lecb_enc_ret 2335 mov 480(KEYP), KLEN 2336 cmp $16, LEN 2337 jb .Lecb_enc_ret 2338 cmp $64, LEN 2339 jb .Lecb_enc_loop1 2340.align 4 2341.Lecb_enc_loop4: 2342 movups (INP), STATE1 2343 movups 0x10(INP), STATE2 2344 movups 0x20(INP), STATE3 2345 movups 0x30(INP), STATE4 2346 call _aesni_enc4 2347 movups STATE1, (OUTP) 2348 movups STATE2, 0x10(OUTP) 2349 movups STATE3, 0x20(OUTP) 2350 movups STATE4, 0x30(OUTP) 2351 sub $64, LEN 2352 add $64, INP 2353 add $64, OUTP 2354 cmp $64, LEN 2355 jge .Lecb_enc_loop4 2356 cmp $16, LEN 2357 jb .Lecb_enc_ret 2358.align 4 2359.Lecb_enc_loop1: 2360 movups (INP), STATE1 2361 call _aesni_enc1 2362 movups STATE1, (OUTP) 2363 sub $16, LEN 2364 add $16, INP 2365 add $16, OUTP 2366 cmp $16, LEN 2367 jge .Lecb_enc_loop1 2368.Lecb_enc_ret: 2369#ifndef __x86_64__ 2370 popl KLEN 2371 popl KEYP 2372 popl LEN 2373#endif 2374 FRAME_END 2375 ret 2376SYM_FUNC_END(aesni_ecb_enc) 2377 2378/* 2379 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2380 * size_t len); 2381 */ 2382SYM_FUNC_START(aesni_ecb_dec) 2383 FRAME_BEGIN 2384#ifndef __x86_64__ 2385 pushl LEN 2386 pushl KEYP 2387 pushl KLEN 2388 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2389 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2390 movl (FRAME_OFFSET+24)(%esp), INP # src 2391 movl (FRAME_OFFSET+28)(%esp), LEN # len 2392#endif 2393 test LEN, LEN 2394 jz .Lecb_dec_ret 2395 mov 480(KEYP), KLEN 2396 add $240, KEYP 2397 cmp $16, LEN 2398 jb .Lecb_dec_ret 2399 cmp $64, LEN 2400 jb .Lecb_dec_loop1 2401.align 4 2402.Lecb_dec_loop4: 2403 movups (INP), STATE1 2404 movups 0x10(INP), STATE2 2405 movups 0x20(INP), STATE3 2406 movups 0x30(INP), STATE4 2407 call _aesni_dec4 2408 movups STATE1, (OUTP) 2409 movups STATE2, 0x10(OUTP) 2410 movups STATE3, 0x20(OUTP) 2411 movups STATE4, 0x30(OUTP) 2412 sub $64, LEN 2413 add $64, INP 2414 add $64, OUTP 2415 cmp $64, LEN 2416 jge .Lecb_dec_loop4 2417 cmp $16, LEN 2418 jb .Lecb_dec_ret 2419.align 4 2420.Lecb_dec_loop1: 2421 movups (INP), STATE1 2422 call _aesni_dec1 2423 movups STATE1, (OUTP) 2424 sub $16, LEN 2425 add $16, INP 2426 add $16, OUTP 2427 cmp $16, LEN 2428 jge .Lecb_dec_loop1 2429.Lecb_dec_ret: 2430#ifndef __x86_64__ 2431 popl KLEN 2432 popl KEYP 2433 popl LEN 2434#endif 2435 FRAME_END 2436 ret 2437SYM_FUNC_END(aesni_ecb_dec) 2438 2439/* 2440 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2441 * size_t len, u8 *iv) 2442 */ 2443SYM_FUNC_START(aesni_cbc_enc) 2444 FRAME_BEGIN 2445#ifndef __x86_64__ 2446 pushl IVP 2447 pushl LEN 2448 pushl KEYP 2449 pushl KLEN 2450 movl 
(FRAME_OFFSET+20)(%esp), KEYP # ctx 2451 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2452 movl (FRAME_OFFSET+28)(%esp), INP # src 2453 movl (FRAME_OFFSET+32)(%esp), LEN # len 2454 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2455#endif 2456 cmp $16, LEN 2457 jb .Lcbc_enc_ret 2458 mov 480(KEYP), KLEN 2459 movups (IVP), STATE # load iv as initial state 2460.align 4 2461.Lcbc_enc_loop: 2462 movups (INP), IN # load input 2463 pxor IN, STATE 2464 call _aesni_enc1 2465 movups STATE, (OUTP) # store output 2466 sub $16, LEN 2467 add $16, INP 2468 add $16, OUTP 2469 cmp $16, LEN 2470 jge .Lcbc_enc_loop 2471 movups STATE, (IVP) 2472.Lcbc_enc_ret: 2473#ifndef __x86_64__ 2474 popl KLEN 2475 popl KEYP 2476 popl LEN 2477 popl IVP 2478#endif 2479 FRAME_END 2480 ret 2481SYM_FUNC_END(aesni_cbc_enc) 2482 2483/* 2484 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2485 * size_t len, u8 *iv) 2486 */ 2487SYM_FUNC_START(aesni_cbc_dec) 2488 FRAME_BEGIN 2489#ifndef __x86_64__ 2490 pushl IVP 2491 pushl LEN 2492 pushl KEYP 2493 pushl KLEN 2494 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2495 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2496 movl (FRAME_OFFSET+28)(%esp), INP # src 2497 movl (FRAME_OFFSET+32)(%esp), LEN # len 2498 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2499#endif 2500 cmp $16, LEN 2501 jb .Lcbc_dec_just_ret 2502 mov 480(KEYP), KLEN 2503 add $240, KEYP 2504 movups (IVP), IV 2505 cmp $64, LEN 2506 jb .Lcbc_dec_loop1 2507.align 4 2508.Lcbc_dec_loop4: 2509 movups (INP), IN1 2510 movaps IN1, STATE1 2511 movups 0x10(INP), IN2 2512 movaps IN2, STATE2 2513#ifdef __x86_64__ 2514 movups 0x20(INP), IN3 2515 movaps IN3, STATE3 2516 movups 0x30(INP), IN4 2517 movaps IN4, STATE4 2518#else 2519 movups 0x20(INP), IN1 2520 movaps IN1, STATE3 2521 movups 0x30(INP), IN2 2522 movaps IN2, STATE4 2523#endif 2524 call _aesni_dec4 2525 pxor IV, STATE1 2526#ifdef __x86_64__ 2527 pxor IN1, STATE2 2528 pxor IN2, STATE3 2529 pxor IN3, STATE4 2530 movaps IN4, IV 2531#else 2532 pxor IN1, STATE4 2533 movaps IN2, IV 2534 movups (INP), IN1 2535 pxor IN1, STATE2 2536 movups 0x10(INP), IN2 2537 pxor IN2, STATE3 2538#endif 2539 movups STATE1, (OUTP) 2540 movups STATE2, 0x10(OUTP) 2541 movups STATE3, 0x20(OUTP) 2542 movups STATE4, 0x30(OUTP) 2543 sub $64, LEN 2544 add $64, INP 2545 add $64, OUTP 2546 cmp $64, LEN 2547 jge .Lcbc_dec_loop4 2548 cmp $16, LEN 2549 jb .Lcbc_dec_ret 2550.align 4 2551.Lcbc_dec_loop1: 2552 movups (INP), IN 2553 movaps IN, STATE 2554 call _aesni_dec1 2555 pxor IV, STATE 2556 movups STATE, (OUTP) 2557 movaps IN, IV 2558 sub $16, LEN 2559 add $16, INP 2560 add $16, OUTP 2561 cmp $16, LEN 2562 jge .Lcbc_dec_loop1 2563.Lcbc_dec_ret: 2564 movups IV, (IVP) 2565.Lcbc_dec_just_ret: 2566#ifndef __x86_64__ 2567 popl KLEN 2568 popl KEYP 2569 popl LEN 2570 popl IVP 2571#endif 2572 FRAME_END 2573 ret 2574SYM_FUNC_END(aesni_cbc_dec) 2575 2576/* 2577 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2578 * size_t len, u8 *iv) 2579 */ 2580SYM_FUNC_START(aesni_cts_cbc_enc) 2581 FRAME_BEGIN 2582#ifndef __x86_64__ 2583 pushl IVP 2584 pushl LEN 2585 pushl KEYP 2586 pushl KLEN 2587 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2588 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2589 movl (FRAME_OFFSET+28)(%esp), INP # src 2590 movl (FRAME_OFFSET+32)(%esp), LEN # len 2591 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2592 lea .Lcts_permute_table, T1 2593#else 2594 lea .Lcts_permute_table(%rip), T1 2595#endif 2596 mov 480(KEYP), KLEN 2597 movups (IVP), STATE 2598 sub $16, LEN 2599 mov T1, IVP 2600 add $32, IVP 2601 
add LEN, T1 2602 sub LEN, IVP 2603 movups (T1), %xmm4 2604 movups (IVP), %xmm5 2605 2606 movups (INP), IN1 2607 add LEN, INP 2608 movups (INP), IN2 2609 2610 pxor IN1, STATE 2611 call _aesni_enc1 2612 2613 pshufb %xmm5, IN2 2614 pxor STATE, IN2 2615 pshufb %xmm4, STATE 2616 add OUTP, LEN 2617 movups STATE, (LEN) 2618 2619 movaps IN2, STATE 2620 call _aesni_enc1 2621 movups STATE, (OUTP) 2622 2623#ifndef __x86_64__ 2624 popl KLEN 2625 popl KEYP 2626 popl LEN 2627 popl IVP 2628#endif 2629 FRAME_END 2630 ret 2631SYM_FUNC_END(aesni_cts_cbc_enc) 2632 2633/* 2634 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2635 * size_t len, u8 *iv) 2636 */ 2637SYM_FUNC_START(aesni_cts_cbc_dec) 2638 FRAME_BEGIN 2639#ifndef __x86_64__ 2640 pushl IVP 2641 pushl LEN 2642 pushl KEYP 2643 pushl KLEN 2644 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2645 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2646 movl (FRAME_OFFSET+28)(%esp), INP # src 2647 movl (FRAME_OFFSET+32)(%esp), LEN # len 2648 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2649 lea .Lcts_permute_table, T1 2650#else 2651 lea .Lcts_permute_table(%rip), T1 2652#endif 2653 mov 480(KEYP), KLEN 2654 add $240, KEYP 2655 movups (IVP), IV 2656 sub $16, LEN 2657 mov T1, IVP 2658 add $32, IVP 2659 add LEN, T1 2660 sub LEN, IVP 2661 movups (T1), %xmm4 2662 2663 movups (INP), STATE 2664 add LEN, INP 2665 movups (INP), IN1 2666 2667 call _aesni_dec1 2668 movaps STATE, IN2 2669 pshufb %xmm4, STATE 2670 pxor IN1, STATE 2671 2672 add OUTP, LEN 2673 movups STATE, (LEN) 2674 2675 movups (IVP), %xmm0 2676 pshufb %xmm0, IN1 2677 pblendvb IN2, IN1 2678 movaps IN1, STATE 2679 call _aesni_dec1 2680 2681 pxor IV, STATE 2682 movups STATE, (OUTP) 2683 2684#ifndef __x86_64__ 2685 popl KLEN 2686 popl KEYP 2687 popl LEN 2688 popl IVP 2689#endif 2690 FRAME_END 2691 ret 2692SYM_FUNC_END(aesni_cts_cbc_dec) 2693 2694.pushsection .rodata 2695.align 16 2696.Lcts_permute_table: 2697 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 2698 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 2699 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 2700 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f 2701 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 2702 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 2703#ifdef __x86_64__ 2704.Lbswap_mask: 2705 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2706#endif 2707.popsection 2708 2709#ifdef __x86_64__ 2710/* 2711 * _aesni_inc_init: internal ABI 2712 * setup registers used by _aesni_inc 2713 * input: 2714 * IV 2715 * output: 2716 * CTR: == IV, in little endian 2717 * TCTR_LOW: == lower qword of CTR 2718 * INC: == 1, in little endian 2719 * BSWAP_MASK == endian swapping mask 2720 */ 2721SYM_FUNC_START_LOCAL(_aesni_inc_init) 2722 movaps .Lbswap_mask, BSWAP_MASK 2723 movaps IV, CTR 2724 pshufb BSWAP_MASK, CTR 2725 mov $1, TCTR_LOW 2726 movq TCTR_LOW, INC 2727 movq CTR, TCTR_LOW 2728 ret 2729SYM_FUNC_END(_aesni_inc_init) 2730 2731/* 2732 * _aesni_inc: internal ABI 2733 * Increase IV by 1, IV is in big endian 2734 * input: 2735 * IV 2736 * CTR: == IV, in little endian 2737 * TCTR_LOW: == lower qword of CTR 2738 * INC: == 1, in little endian 2739 * BSWAP_MASK == endian swapping mask 2740 * output: 2741 * IV: Increase by 1 2742 * changed: 2743 * CTR: == output IV, in little endian 2744 * TCTR_LOW: == lower qword of CTR 2745 */ 2746SYM_FUNC_START_LOCAL(_aesni_inc) 2747 paddq INC, CTR 2748 add $1, TCTR_LOW 2749 jnc .Linc_low 2750 pslldq $8, INC 2751 paddq INC, CTR 2752 psrldq $8, INC 2753.Linc_low: 2754 movaps 
CTR, IV 2755 pshufb BSWAP_MASK, IV 2756 ret 2757SYM_FUNC_END(_aesni_inc) 2758 2759/* 2760 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2761 * size_t len, u8 *iv) 2762 */ 2763SYM_FUNC_START(aesni_ctr_enc) 2764 FRAME_BEGIN 2765 cmp $16, LEN 2766 jb .Lctr_enc_just_ret 2767 mov 480(KEYP), KLEN 2768 movups (IVP), IV 2769 call _aesni_inc_init 2770 cmp $64, LEN 2771 jb .Lctr_enc_loop1 2772.align 4 2773.Lctr_enc_loop4: 2774 movaps IV, STATE1 2775 call _aesni_inc 2776 movups (INP), IN1 2777 movaps IV, STATE2 2778 call _aesni_inc 2779 movups 0x10(INP), IN2 2780 movaps IV, STATE3 2781 call _aesni_inc 2782 movups 0x20(INP), IN3 2783 movaps IV, STATE4 2784 call _aesni_inc 2785 movups 0x30(INP), IN4 2786 call _aesni_enc4 2787 pxor IN1, STATE1 2788 movups STATE1, (OUTP) 2789 pxor IN2, STATE2 2790 movups STATE2, 0x10(OUTP) 2791 pxor IN3, STATE3 2792 movups STATE3, 0x20(OUTP) 2793 pxor IN4, STATE4 2794 movups STATE4, 0x30(OUTP) 2795 sub $64, LEN 2796 add $64, INP 2797 add $64, OUTP 2798 cmp $64, LEN 2799 jge .Lctr_enc_loop4 2800 cmp $16, LEN 2801 jb .Lctr_enc_ret 2802.align 4 2803.Lctr_enc_loop1: 2804 movaps IV, STATE 2805 call _aesni_inc 2806 movups (INP), IN 2807 call _aesni_enc1 2808 pxor IN, STATE 2809 movups STATE, (OUTP) 2810 sub $16, LEN 2811 add $16, INP 2812 add $16, OUTP 2813 cmp $16, LEN 2814 jge .Lctr_enc_loop1 2815.Lctr_enc_ret: 2816 movups IV, (IVP) 2817.Lctr_enc_just_ret: 2818 FRAME_END 2819 ret 2820SYM_FUNC_END(aesni_ctr_enc) 2821 2822#endif 2823 2824.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 2825.align 16 2826.Lgf128mul_x_ble_mask: 2827 .octa 0x00000000000000010000000000000087 2828.previous 2829 2830/* 2831 * _aesni_gf128mul_x_ble: internal ABI 2832 * Multiply in GF(2^128) for XTS IVs 2833 * input: 2834 * IV: current IV 2835 * GF128MUL_MASK == mask with 0x87 and 0x01 2836 * output: 2837 * IV: next IV 2838 * changed: 2839 * CTR: == temporary value 2840 */ 2841#define _aesni_gf128mul_x_ble() \ 2842 pshufd $0x13, IV, KEY; \ 2843 paddq IV, IV; \ 2844 psrad $31, KEY; \ 2845 pand GF128MUL_MASK, KEY; \ 2846 pxor KEY, IV; 2847 2848/* 2849 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, 2850 * const u8 *src, unsigned int len, le128 *iv) 2851 */ 2852SYM_FUNC_START(aesni_xts_encrypt) 2853 FRAME_BEGIN 2854#ifndef __x86_64__ 2855 pushl IVP 2856 pushl LEN 2857 pushl KEYP 2858 pushl KLEN 2859 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2860 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2861 movl (FRAME_OFFSET+28)(%esp), INP # src 2862 movl (FRAME_OFFSET+32)(%esp), LEN # len 2863 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2864 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK 2865#else 2866 movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK 2867#endif 2868 movups (IVP), IV 2869 2870 mov 480(KEYP), KLEN 2871 2872.Lxts_enc_loop4: 2873 sub $64, LEN 2874 jl .Lxts_enc_1x 2875 2876 movdqa IV, STATE1 2877 movdqu 0x00(INP), IN 2878 pxor IN, STATE1 2879 movdqu IV, 0x00(OUTP) 2880 2881 _aesni_gf128mul_x_ble() 2882 movdqa IV, STATE2 2883 movdqu 0x10(INP), IN 2884 pxor IN, STATE2 2885 movdqu IV, 0x10(OUTP) 2886 2887 _aesni_gf128mul_x_ble() 2888 movdqa IV, STATE3 2889 movdqu 0x20(INP), IN 2890 pxor IN, STATE3 2891 movdqu IV, 0x20(OUTP) 2892 2893 _aesni_gf128mul_x_ble() 2894 movdqa IV, STATE4 2895 movdqu 0x30(INP), IN 2896 pxor IN, STATE4 2897 movdqu IV, 0x30(OUTP) 2898 2899 call _aesni_enc4 2900 2901 movdqu 0x00(OUTP), IN 2902 pxor IN, STATE1 2903 movdqu STATE1, 0x00(OUTP) 2904 2905 movdqu 0x10(OUTP), IN 2906 pxor IN, STATE2 2907 movdqu STATE2, 0x10(OUTP) 2908 2909 
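/*
 * Implementation note for the 4-way XTS loops: the per-block tweaks are
 * parked in the destination buffer (the "movdqu IV, 0xNN(OUTP)" stores)
 * before _aesni_enc4/_aesni_dec4 run, then reloaded and XORed back into
 * the cipher output right here.  This reuses OUTP as scratch space
 * instead of tying up four more %xmm registers just to hold the tweaks.
 */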
movdqu 0x20(OUTP), IN 2910 pxor IN, STATE3 2911 movdqu STATE3, 0x20(OUTP) 2912 2913 movdqu 0x30(OUTP), IN 2914 pxor IN, STATE4 2915 movdqu STATE4, 0x30(OUTP) 2916 2917 _aesni_gf128mul_x_ble() 2918 2919 add $64, INP 2920 add $64, OUTP 2921 test LEN, LEN 2922 jnz .Lxts_enc_loop4 2923 2924.Lxts_enc_ret_iv: 2925 movups IV, (IVP) 2926 2927.Lxts_enc_ret: 2928#ifndef __x86_64__ 2929 popl KLEN 2930 popl KEYP 2931 popl LEN 2932 popl IVP 2933#endif 2934 FRAME_END 2935 ret 2936 2937.Lxts_enc_1x: 2938 add $64, LEN 2939 jz .Lxts_enc_ret_iv 2940 sub $16, LEN 2941 jl .Lxts_enc_cts4 2942 2943.Lxts_enc_loop1: 2944 movdqu (INP), STATE 2945 pxor IV, STATE 2946 call _aesni_enc1 2947 pxor IV, STATE 2948 _aesni_gf128mul_x_ble() 2949 2950 test LEN, LEN 2951 jz .Lxts_enc_out 2952 2953 add $16, INP 2954 sub $16, LEN 2955 jl .Lxts_enc_cts1 2956 2957 movdqu STATE, (OUTP) 2958 add $16, OUTP 2959 jmp .Lxts_enc_loop1 2960 2961.Lxts_enc_out: 2962 movdqu STATE, (OUTP) 2963 jmp .Lxts_enc_ret_iv 2964 2965.Lxts_enc_cts4: 2966 movdqa STATE4, STATE 2967 sub $16, OUTP 2968 2969.Lxts_enc_cts1: 2970#ifndef __x86_64__ 2971 lea .Lcts_permute_table, T1 2972#else 2973 lea .Lcts_permute_table(%rip), T1 2974#endif 2975 add LEN, INP /* rewind input pointer */ 2976 add $16, LEN /* # bytes in final block */ 2977 movups (INP), IN1 2978 2979 mov T1, IVP 2980 add $32, IVP 2981 add LEN, T1 2982 sub LEN, IVP 2983 add OUTP, LEN 2984 2985 movups (T1), %xmm4 2986 movaps STATE, IN2 2987 pshufb %xmm4, STATE 2988 movups STATE, (LEN) 2989 2990 movups (IVP), %xmm0 2991 pshufb %xmm0, IN1 2992 pblendvb IN2, IN1 2993 movaps IN1, STATE 2994 2995 pxor IV, STATE 2996 call _aesni_enc1 2997 pxor IV, STATE 2998 2999 movups STATE, (OUTP) 3000 jmp .Lxts_enc_ret 3001SYM_FUNC_END(aesni_xts_encrypt) 3002 3003/* 3004 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, 3005 * const u8 *src, unsigned int len, le128 *iv) 3006 */ 3007SYM_FUNC_START(aesni_xts_decrypt) 3008 FRAME_BEGIN 3009#ifndef __x86_64__ 3010 pushl IVP 3011 pushl LEN 3012 pushl KEYP 3013 pushl KLEN 3014 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 3015 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 3016 movl (FRAME_OFFSET+28)(%esp), INP # src 3017 movl (FRAME_OFFSET+32)(%esp), LEN # len 3018 movl (FRAME_OFFSET+36)(%esp), IVP # iv 3019 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK 3020#else 3021 movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK 3022#endif 3023 movups (IVP), IV 3024 3025 mov 480(KEYP), KLEN 3026 add $240, KEYP 3027 3028 test $15, LEN 3029 jz .Lxts_dec_loop4 3030 sub $16, LEN 3031 3032.Lxts_dec_loop4: 3033 sub $64, LEN 3034 jl .Lxts_dec_1x 3035 3036 movdqa IV, STATE1 3037 movdqu 0x00(INP), IN 3038 pxor IN, STATE1 3039 movdqu IV, 0x00(OUTP) 3040 3041 _aesni_gf128mul_x_ble() 3042 movdqa IV, STATE2 3043 movdqu 0x10(INP), IN 3044 pxor IN, STATE2 3045 movdqu IV, 0x10(OUTP) 3046 3047 _aesni_gf128mul_x_ble() 3048 movdqa IV, STATE3 3049 movdqu 0x20(INP), IN 3050 pxor IN, STATE3 3051 movdqu IV, 0x20(OUTP) 3052 3053 _aesni_gf128mul_x_ble() 3054 movdqa IV, STATE4 3055 movdqu 0x30(INP), IN 3056 pxor IN, STATE4 3057 movdqu IV, 0x30(OUTP) 3058 3059 call _aesni_dec4 3060 3061 movdqu 0x00(OUTP), IN 3062 pxor IN, STATE1 3063 movdqu STATE1, 0x00(OUTP) 3064 3065 movdqu 0x10(OUTP), IN 3066 pxor IN, STATE2 3067 movdqu STATE2, 0x10(OUTP) 3068 3069 movdqu 0x20(OUTP), IN 3070 pxor IN, STATE3 3071 movdqu STATE3, 0x20(OUTP) 3072 3073 movdqu 0x30(OUTP), IN 3074 pxor IN, STATE4 3075 movdqu STATE4, 0x30(OUTP) 3076 3077 _aesni_gf128mul_x_ble() 3078 3079 add $64, INP 3080 add $64, OUTP 3081 test LEN, LEN 3082 jnz 
.Lxts_dec_loop4
3083
3084.Lxts_dec_ret_iv:
3085	movups IV, (IVP)
3086
3087.Lxts_dec_ret:
3088#ifndef __x86_64__
3089	popl KLEN
3090	popl KEYP
3091	popl LEN
3092	popl IVP
3093#endif
3094	FRAME_END
3095	ret
3096
3097.Lxts_dec_1x:
3098	add $64, LEN
3099	jz .Lxts_dec_ret_iv
3100
3101.Lxts_dec_loop1:
3102	movdqu (INP), STATE
3103
3104	add $16, INP
3105	sub $16, LEN
3106	jl .Lxts_dec_cts1
3107
3108	pxor IV, STATE
3109	call _aesni_dec1
3110	pxor IV, STATE
3111	_aesni_gf128mul_x_ble()
3112
3113	test LEN, LEN
3114	jz .Lxts_dec_out
3115
3116	movdqu STATE, (OUTP)
3117	add $16, OUTP
3118	jmp .Lxts_dec_loop1
3119
3120.Lxts_dec_out:
3121	movdqu STATE, (OUTP)
3122	jmp .Lxts_dec_ret_iv
3123
3124.Lxts_dec_cts1:
3125	movdqa IV, STATE4
3126	_aesni_gf128mul_x_ble()
3127
3128	pxor IV, STATE
3129	call _aesni_dec1
3130	pxor IV, STATE
3131
3132#ifndef __x86_64__
3133	lea .Lcts_permute_table, T1
3134#else
3135	lea .Lcts_permute_table(%rip), T1
3136#endif
3137	add LEN, INP /* rewind input pointer */
3138	add $16, LEN /* # bytes in final block */
3139	movups (INP), IN1
3140
3141	mov T1, IVP
3142	add $32, IVP
3143	add LEN, T1
3144	sub LEN, IVP
3145	add OUTP, LEN
3146
3147	movups (T1), %xmm4
3148	movaps STATE, IN2
3149	pshufb %xmm4, STATE
3150	movups STATE, (LEN)
3151
3152	movups (IVP), %xmm0
3153	pshufb %xmm0, IN1
3154	pblendvb IN2, IN1
3155	movaps IN1, STATE
3156
3157	pxor STATE4, STATE
3158	call _aesni_dec1
3159	pxor STATE4, STATE
3160
3161	movups STATE, (OUTP)
3162	jmp .Lxts_dec_ret
3163SYM_FUNC_END(aesni_xts_decrypt)
3164
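/*
 * For reference, a hedged C sketch (illustrative only; not the kernel's
 * gf128mul helpers) of the tweak update that _aesni_gf128mul_x_ble()
 * performs in the XTS code above.  The 128-bit tweak, viewed as a
 * little-endian value split into two 64-bit halves, is multiplied by x
 * in GF(2^128) and reduced by x^128 + x^7 + x^2 + x + 1 (0x87):
 *
 *	#include <stdint.h>
 *
 *	static void xts_double_tweak(uint64_t t[2])	// t[0] = low, t[1] = high half
 *	{
 *		uint64_t carry = t[1] >> 63;		// bit 127 about to be shifted out
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);	// 128-bit shift left by one
 *		t[0] <<= 1;
 *		if (carry)
 *			t[0] ^= 0x87;			// reduce modulo the XTS polynomial
 *	}
 *
 * The assembly macro computes the same result branchlessly: pshufd/psrad
 * capture the top bit of each half before paddq doubles both halves, and
 * the pand/pxor against .Lgf128mul_x_ble_mask fold the inter-half carry
 * and the 0x87 reduction back in.
 */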