/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values; for FP
 * use movaps (move aligned packed single), for integer use movdqa (move
 * double quad aligned).  It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released.
 * However, movaps is a byte shorter, so that is the one we'll use for now.
 * (same for unaligned).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

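# Note on the constants above: SHUF_MASK is the pshufb control that byte-swaps
# a 16-byte block; counter blocks and GHASH inputs below are processed in this
# swapped form, which is why a plain "paddd ONE(%rip)" is enough to increment
# the counter.
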
# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text


#define	STACK_OFFSET    8*3

#define	AadHash 16*0
#define	AadLen 16*1
#define	InLen (16*1)+8
#define	PBlockEncKey 16*2
#define	OrigIV 16*3
#define	CurCount 16*4
#define	PBlockLen 16*5
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm7

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm

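# A note on stack-based arguments: FUNC_SAVE pushes three registers, so
# together with the return address the 7th and later arguments end up at
# STACK_OFFSET+8(%rsp) and up.  This is why STACK_OFFSET is 8*3 and why arg7
# through arg11 are only meaningful between FUNC_SAVE and FUNC_RESTORE.
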
# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data.  Only needs to be called
# once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	pshufb	\TMP2, \TMP3

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3
	psrlq	$63, \TMP2
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2
	psrldq	$8, \TMP1
	por	\TMP2, \TMP3

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd	TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP3, \TMP1
	pxor	\TMP3, \TMP1
	movdqu	\TMP1, HashKey_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	\TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	\TMP5, HashKey_3(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	\TMP5, HashKey_4(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_4_k(%arg2)
.endm

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov	\AADLEN, %r11
	mov	%r11, AadLen(%arg2)		# ctx_data.aad_length = aad_length
	xor	%r11d, %r11d
	mov	%r11, InLen(%arg2)		# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)		# ctx_data.partial_block_length = 0
	mov	%r11, PBlockEncKey(%arg2)	# ctx_data.partial_block_enc_key = 0
	mov	\Iv, %rax
	movdqu	(%rax), %xmm0
	movdqu	%xmm0, OrigIV(%arg2)		# ctx_data.orig_IV = iv

	movdqa	SHUF_MASK(%rip), %xmm2
	pshufb	%xmm2, %xmm0
	movdqu	%xmm0, CurCount(%arg2)		# ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu	HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13
	add	%arg5, InLen(%arg2)

	xor	%r11d, %r11d	# initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub	%r11, %arg5	# sub partial block data used
	mov	%arg5, %r13	# save the number of bytes

	and	$-16, %r13	# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	# Encrypt/Decrypt first few blocks

	and	$(3<<4), %r12
	jz	.L_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	.L_initial_num_blocks_is_1_\@
	je	.L_initial_num_blocks_is_2_\@
.L_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	.L_initial_blocks_\@
.L_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	.L_initial_blocks_\@
.L_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	.L_initial_blocks_\@
.L_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
.L_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	test	%r13, %r13
	je	.L_zero_cipher_left_\@
	sub	$64, %r13
	je	.L_four_cipher_left_\@
.L_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	.L_crypt_by_4_\@
.L_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
.L_zero_cipher_left_\@:
	movdqu	%xmm8, AadHash(%arg2)
	movdqu	%xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13	# %r13 = arg5 (mod 16)
	je	.L_multiple_of_16_bytes_\@

	mov	%r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0	# INCR CNT to get Yn
	movdqu	%xmm0, CurCount(%arg2)
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	movdqu	%xmm0, PBlockEncKey(%arg2)

	cmp	$16, %arg5
	jge	.L_large_enough_update_\@

	lea	(%arg4,%r11,1), %r10
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp	.L_data_read_\@

.L_large_enough_update_\@:
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	pshufb	%xmm2, %xmm1

.L_data_read_\@:
	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12

.ifc \operation, dec
	movdqa	%xmm1, %xmm2
.endif
	pxor	%xmm1, %xmm0	# XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0	# mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm2

	pxor	%xmm2, %xmm8
.else
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm0

	pxor	%xmm0, %xmm8
.endif

	movdqu	%xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	pshufb	%xmm10, %xmm0
.endif

	# Output %r13 bytes
	movq	%xmm0, %rax
	cmp	$8, %r13
	jle	.L_less_than_8_bytes_left_\@
	mov	%rax, (%arg3, %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	movq	%xmm0, %rax
	sub	$8, %r13
.L_less_than_8_bytes_left_\@:
	mov	%al, (%arg3, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	.L_less_than_8_bytes_left_\@
.L_multiple_of_16_bytes_\@:
.endm

# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13

	mov	PBlockLen(%arg2), %r12

	test	%r12, %r12
	je	.L_partial_done\@

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

.L_partial_done\@:
	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	mov	InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*128)
	movq	%r12, %xmm1

	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm8

	movdqu	OrigIV(%arg2), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
.L_return_T_\@:
	mov	\AUTHTAG, %r10		# %r10 = authTag
	mov	\AUTHTAGLEN, %r11	# %r11 = auth_tag_len
	cmp	$16, %r11
	je	.L_T_16_\@
	cmp	$8, %r11
	jl	.L_T_4_\@
.L_T_8_\@:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	test	%r11, %r11
	je	.L_return_T_done_\@
.L_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	test	%r11, %r11
	je	.L_return_T_done_\@
.L_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	.L_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	.L_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
.L_T_1_\@:
	mov	%al, (%r10)
	jmp	.L_return_T_done_\@
.L_T_16_\@:
	movdqu	%xmm0, (%r10)
.L_return_T_done_\@:
.endm

#ifdef __x86_64__
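# Karatsuba refresher for the carry-less multiplies below: with A = A1:A0 and
# B = B1:B0 (64-bit halves), A*B = A1*B1*x^128 + A0*B0 +
# ((A1 xor A0)*(B1 xor B0) xor A1*B1 xor A0*B0)*x^64, where "+" is XOR.
# The precomputed HashKey_*_k values hold the xor-of-halves for the hash key
# powers, so only the data-side xor has to be formed at run time.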
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	\GH, \TMP1
	pshufd	$78, \GH, \TMP2
	pshufd	$78, \HK, \TMP3
	pxor	\GH, \TMP2		# TMP2 = a1+a0
	pxor	\HK, \TMP3		# TMP3 = b1+b0
	pclmulqdq $0x11, \HK, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \HK, \GH	# GH = a0*b0
	pclmulqdq $0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	\GH, \TMP2
	pxor	\TMP1, \TMP2		# TMP2 = (a1*b0)+(a0*b1)
	movdqa	\TMP2, \TMP3
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP3, \GH
	pxor	\TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	\GH, \TMP2
	movdqa	\GH, \TMP3
	movdqa	\GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	$31, \TMP2		# packed left shift <<31
	pslld	$30, \TMP3		# packed left shift <<30
	pslld	$25, \TMP4		# packed left shift <<25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP5
	psrldq	$4, \TMP5		# right shift TMP5 1 DW
	pslldq	$12, \TMP2		# left shift TMP2 3 DWs
	pxor	\TMP2, \GH

	# second phase of the reduction

	movdqa	\GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	\GH,\TMP3
	movdqa	\GH,\TMP4
	psrld	$1,\TMP2		# packed right shift >>1
	psrld	$2,\TMP3		# packed right shift >>2
	psrld	$7,\TMP4		# packed right shift >>7
	pxor	\TMP3,\TMP2		# xor the shifted versions
	pxor	\TMP4,\TMP2
	pxor	\TMP5, \TMP2
	pxor	\TMP2, \GH
	pxor	\TMP1, \GH		# result is in GH
.endm

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	.L_read_lt8_\@
	mov	(\DPTR), %rax
	movq	%rax, \XMMDst
	sub	$8, \DLEN
	jz	.L_done_read_partial_block_\@
	xor	%eax, %eax
.L_read_next_byte_\@:
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	.L_read_next_byte_\@
	movq	%rax, \XMM1
	pslldq	$8, \XMM1
	por	\XMM1, \XMMDst
	jmp	.L_done_read_partial_block_\@
.L_read_lt8_\@:
	xor	%eax, %eax
.L_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	.L_read_next_byte_lt8_\@
	movq	%rax, \XMMDst
.L_done_read_partial_block_\@:
.endm

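# Worked example for READ_PARTIAL_BLOCK (illustrative): with DLEN = 11, the
# first 8 bytes are loaded directly into the low quadword of XMMDst, bytes 10,
# 9 and 8 are then accumulated most-significant-first in %rax and merged into
# the high quadword via pslldq/por, and bytes 11-15 of XMMDst stay zero.  For
# DLEN < 8 only the %rax byte loop runs and the result lands in the low half.
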
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	SHUF_MASK(%rip), %xmm14
	mov	\AAD, %r10		# %r10 = AAD
	mov	\AADLEN, %r11		# %r11 = aadLen
	pxor	\TMP7, \TMP7
	pxor	\TMP6, \TMP6

	cmp	$16, %r11
	jl	.L_get_AAD_rest\@
.L_get_AAD_blocks\@:
	movdqu	(%r10), \TMP7
	pshufb	%xmm14, \TMP7	# byte-reflect the AAD data
	pxor	\TMP7, \TMP6
	GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	$16, %r10
	sub	$16, %r11
	cmp	$16, %r11
	jge	.L_get_AAD_blocks\@

	movdqu	\TMP6, \TMP7

	/* read the last <16B of AAD */
.L_get_AAD_rest\@:
	test	%r11, %r11
	je	.L_get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	pshufb	%xmm14, \TMP7	# byte-reflect the AAD data
	pxor	\TMP6, \TMP7
	GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	\TMP7, \TMP6

.L_get_AAD_done\@:
	movdqu	\TMP6, AadHash(%arg2)
.endm

# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov	PBlockLen(%arg2), %r13
	test	%r13, %r13
	je	.L_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	.L_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
	jmp	.L_data_read_\@

.L_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov	PBlockLen(%arg2), %r13

.L_data_read_\@:			# Finished reading in data

	movdqu	PBlockEncKey(%arg2), %xmm9
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (16-r13 is the number of bytes in plaintext mod 16)
	add	%r13, %r12
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	pshufb	%xmm2, %xmm9		# shift right r13 bytes

.ifc \operation, dec
	movdqa	%xmm1, %xmm3
	pxor	%xmm1, %xmm9		# Ciphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	.L_no_extra_mask_1_\@
	sub	%r10, %r12
.L_no_extra_mask_1_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	pand	%xmm1, %xmm3
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm3
	pshufb	%xmm2, %xmm3
	pxor	%xmm3, \AAD_HASH

	test	%r10, %r10
	jl	.L_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)
	jmp	.L_dec_done_\@
.L_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
.L_dec_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)
.else
	pxor	%xmm1, %xmm9		# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	.L_no_extra_mask_2_\@
	sub	%r10, %r12
.L_no_extra_mask_2_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9

	movdqa	SHUF_MASK(%rip), %xmm1
	pshufb	%xmm1, %xmm9
	pshufb	%xmm2, %xmm9
	pxor	%xmm9, \AAD_HASH

	test	%r10, %r10
	jl	.L_partial_incomplete_2_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)
	jmp	.L_encode_done_\@
.L_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
.L_encode_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)

	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm9 back to output as ciphertext
	pshufb	%xmm10, %xmm9
	pshufb	%xmm2, %xmm9
.endif
	# output encrypted Bytes
	test	%r10, %r10
	jl	.L_partial_fill_\@
	mov	%r13, %r12
	mov	$16, %r13
	# Set r13 to be the number of bytes to write out
	sub	%r12, %r13
	jmp	.L_count_set_\@
.L_partial_fill_\@:
	mov	\PLAIN_CYPH_LEN, %r13
.L_count_set_\@:
	movdqa	%xmm9, %xmm0
	movq	%xmm0, %rax
	cmp	$8, %r13
	jle	.L_less_than_8_bytes_left_\@

	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$8, \DATA_OFFSET
	psrldq	$8, %xmm0
	movq	%xmm0, %rax
	sub	$8, %r13
.L_less_than_8_bytes_left_\@:
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$1, \DATA_OFFSET
	shr	$8, %rax
	sub	$1, %r13
	jne	.L_less_than_8_bytes_left_\@
.L_partial_block_done_\@:
.endm # PARTIAL_BLOCK

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3 are used as a pointer only, not modified
*/


.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	SHUF_MASK(%rip), %xmm14

	movdqu	AadHash(%arg2), %xmm\i	# load AadHash

	# start AES for num_initial_blocks blocks

	movdqu	CurCount(%arg2), \XMM0	# XMM0 = Y0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ	ONE(%RIP),\TMP1
	MOVADQ	0(%arg1),\TMP2
.irpc index, \i_seq
	paddd	\TMP1, \XMM0		# INCR Y0
.ifc \operation, dec
	movdqa	\XMM0, %xmm\index
.else
	MOVADQ	\XMM0, %xmm\index
.endif
	pshufb	%xmm14, %xmm\index	# perform a 16 byte swap
	pxor	\TMP2, %xmm\index
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13

.Laes_loop_initial_\@:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	aesenc	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	.Laes_loop_initial_\@

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	aesenclast \TMP1, %xmm\index	# Last Round
.endr
.irpc index, \i_seq
	movdqu	(%arg4, %r11, 1), \TMP1
	pxor	\TMP1, %xmm\index
	movdqu	%xmm\index, (%arg3, %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	$16, %r11

.ifc \operation, dec
	movdqa	\TMP1, %xmm\index
.endif
	pshufb	%xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif

	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	%xmm5, %xmm6
	GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	%xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	%xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	$64, %r13
	jl	.L_initial_blocks_done\@
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
	MOVADQ	ONE(%RIP),\TMP1
	paddd	\TMP1, \XMM0		# INCR Y0
	MOVADQ	\XMM0, \XMM1
	pshufb	%xmm14, \XMM1		# perform a 16 byte swap

	paddd	\TMP1, \XMM0		# INCR Y0
	MOVADQ	\XMM0, \XMM2
	pshufb	%xmm14, \XMM2		# perform a 16 byte swap

	paddd	\TMP1, \XMM0		# INCR Y0
	MOVADQ	\XMM0, \XMM3
	pshufb	%xmm14, \XMM3		# perform a 16 byte swap

	paddd	\TMP1, \XMM0		# INCR Y0
	MOVADQ	\XMM0, \XMM4
	pshufb	%xmm14, \XMM4		# perform a 16 byte swap

	MOVADQ	0(%arg1),\TMP1
	pxor	\TMP1, \XMM1
	pxor	\TMP1, \XMM2
	pxor	\TMP1, \XMM3
	pxor	\TMP1, \XMM4
.irpc index, 1234 # do 4 rounds
	movaps	0x10*\index(%arg1), \TMP1
	aesenc	\TMP1, \XMM1
	aesenc	\TMP1, \XMM2
	aesenc	\TMP1, \XMM3
	aesenc	\TMP1, \XMM4
.endr
.irpc index, 56789 # do next 5 rounds
	movaps	0x10*\index(%arg1), \TMP1
	aesenc	\TMP1, \XMM1
	aesenc	\TMP1, \XMM2
	aesenc	\TMP1, \XMM3
	aesenc	\TMP1, \XMM4
.endr
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	sub	$4,%eax			# 128->0, 192->2, 256->4
	jz	.Laes_loop_pre_done\@

.Laes_loop_pre_\@:
	MOVADQ	(%r10),\TMP2
.irpc	index, 1234
	aesenc	\TMP2, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	.Laes_loop_pre_\@

.Laes_loop_pre_done\@:
	MOVADQ	(%r10), \TMP2
	aesenclast \TMP2, \XMM1
	aesenclast \TMP2, \XMM2
	aesenclast \TMP2, \XMM3
	aesenclast \TMP2, \XMM4
	movdqu	16*0(%arg4, %r11, 1), \TMP1
	pxor	\TMP1, \XMM1
.ifc \operation, dec
	movdqu	\XMM1, 16*0(%arg3, %r11, 1)
	movdqa	\TMP1, \XMM1
.endif
	movdqu	16*1(%arg4, %r11, 1), \TMP1
	pxor	\TMP1, \XMM2
.ifc \operation, dec
	movdqu	\XMM2, 16*1(%arg3, %r11, 1)
	movdqa	\TMP1, \XMM2
.endif
	movdqu	16*2(%arg4, %r11, 1), \TMP1
	pxor	\TMP1, \XMM3
.ifc \operation, dec
	movdqu	\XMM3, 16*2(%arg3, %r11, 1)
	movdqa	\TMP1, \XMM3
.endif
	movdqu	16*3(%arg4, %r11, 1), \TMP1
	pxor	\TMP1, \XMM4
.ifc \operation, dec
	movdqu	\XMM4, 16*3(%arg3, %r11, 1)
	movdqa	\TMP1, \XMM4
.else
	movdqu	\XMM1, 16*0(%arg3, %r11, 1)
	movdqu	\XMM2, 16*1(%arg3, %r11, 1)
	movdqu	\XMM3, 16*2(%arg3, %r11, 1)
	movdqu	\XMM4, 16*3(%arg3, %r11, 1)
.endif

	add	$64, %r11
	pshufb	%xmm14, \XMM1		# perform a 16 byte swap
	pxor	\XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	pshufb	%xmm14, \XMM2		# perform a 16 byte swap
	pshufb	%xmm14, \XMM3		# perform a 16 byte swap
	pshufb	%xmm14, \XMM4		# perform a 16 byte swap

.L_initial_blocks_done\@:

.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	\XMM1, \XMM5
	movdqa	\XMM2, \XMM6
	movdqa	\XMM3, \XMM7
	movdqa	\XMM4, \XMM8

	movdqa	SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	\XMM5, \TMP4
	pshufd	$78, \XMM5, \TMP6
	pxor	\XMM5, \TMP6
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqu	HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	movdqa	\XMM0, \XMM1
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqa	\XMM0, \XMM2
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqa	\XMM0, \XMM3
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqa	\XMM0, \XMM4
	pshufb	%xmm15, \XMM1		# perform a 16 byte swap
	pclmulqdq $0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	pshufb	%xmm15, \XMM2		# perform a 16 byte swap
	pshufb	%xmm15, \XMM3		# perform a 16 byte swap
	pshufb	%xmm15, \XMM4		# perform a 16 byte swap

	pxor	(%arg1), \XMM1
	pxor	(%arg1), \XMM2
	pxor	(%arg1), \XMM3
	pxor	(%arg1), \XMM4
	movdqu	HashKey_4_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps	0x10(%arg1), \TMP1
	aesenc	\TMP1, \XMM1		# Round 1
	aesenc	\TMP1, \XMM2
	aesenc	\TMP1, \XMM3
	aesenc	\TMP1, \XMM4
	movaps	0x20(%arg1), \TMP1
	aesenc	\TMP1, \XMM1		# Round 2
	aesenc	\TMP1, \XMM2
	aesenc	\TMP1, \XMM3
	aesenc	\TMP1, \XMM4
	movdqa	\XMM6, \TMP1
	pshufd	$78, \XMM6, \TMP2
	pxor	\XMM6, \TMP2
	movdqu	HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps	0x30(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 3
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps	0x40(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 4
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	movdqu	HashKey_3_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x50(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 5
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pxor	\TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	\XMM6, \XMM5
	pxor	\TMP2, \TMP6
	movdqa	\XMM7, \TMP1
	pshufd	$78, \XMM7, \TMP2
	pxor	\XMM7, \TMP2
	movdqu	HashKey_2(%arg2), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x60(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 6
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps	0x70(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 7
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	movdqu	HashKey_2_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x80(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 8
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pxor	\TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	\XMM7, \XMM5
	pxor	\TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	\XMM8, \TMP1
	pshufd	$78, \XMM8, \TMP2
	pxor	\XMM8, \TMP2
	movdqu	HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x90(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 9
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	sub	$4,%eax			# 128->0, 192->2, 256->4
	jz	.Laes_loop_par_enc_done\@

.Laes_loop_par_enc\@:
	MOVADQ	(%r10),\TMP3
.irpc	index, 1234
	aesenc	\TMP3, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	.Laes_loop_par_enc\@

.Laes_loop_par_enc_done\@:
	MOVADQ	(%r10), \TMP3
	aesenclast \TMP3, \XMM1		# Round 10
	aesenclast \TMP3, \XMM2
	aesenclast \TMP3, \XMM3
	aesenclast \TMP3, \XMM4
	movdqu	HashKey_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqu	(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
	movdqu	16(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
	movdqu	32(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
	movdqu	48(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM1, (%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	\XMM2, 16(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	\XMM3, 32(%arg3,%r11,1)	# Write to the ciphertext buffer
	movdqu	\XMM4, 48(%arg3,%r11,1)	# Write to the ciphertext buffer
	pshufb	%xmm15, \XMM1		# perform a 16 byte swap
	pshufb	%xmm15, \XMM2		# perform a 16 byte swap
	pshufb	%xmm15, \XMM3		# perform a 16 byte swap
	pshufb	%xmm15, \XMM4		# perform a 16 byte swap

	pxor	\TMP4, \TMP1
	pxor	\XMM8, \XMM5
	pxor	\TMP6, \TMP2
	pxor	\TMP1, \TMP2
	pxor	\XMM5, \TMP2
	movdqa	\TMP2, \TMP3
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP3, \XMM5
	pxor	\TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	\XMM5, \TMP2
	movdqa	\XMM5, \TMP3
	movdqa	\XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	$31, \TMP2		# packed left shift << 31
	pslld	$30, \TMP3		# packed left shift << 30
	pslld	$25, \TMP4		# packed left shift << 25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP5
	psrldq	$4, \TMP5		# right shift T5 1 DW
	pslldq	$12, \TMP2		# left shift T2 3 DWs
	pxor	\TMP2, \XMM5

	# second phase of reduction

	movdqa	\XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	\XMM5,\TMP3
	movdqa	\XMM5,\TMP4
	psrld	$1, \TMP2		# packed right shift >>1
	psrld	$2, \TMP3		# packed right shift >>2
	psrld	$7, \TMP4		# packed right shift >>7
	pxor	\TMP3,\TMP2		# xor the shifted versions
	pxor	\TMP4,\TMP2
	pxor	\TMP5, \TMP2
	pxor	\TMP2, \XMM5
	pxor	\TMP1, \XMM5		# result is in XMM5

	pxor	\XMM5, \XMM1
.endm

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	\XMM1, \XMM5
	movdqa	\XMM2, \XMM6
	movdqa	\XMM3, \XMM7
	movdqa	\XMM4, \XMM8

	movdqa	SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	\XMM5, \TMP4
	pshufd	$78, \XMM5, \TMP6
	pxor	\XMM5, \TMP6
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqu	HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	movdqa	\XMM0, \XMM1
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqa	\XMM0, \XMM2
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqa	\XMM0, \XMM3
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqa	\XMM0, \XMM4
	pshufb	%xmm15, \XMM1		# perform a 16 byte swap
	pclmulqdq $0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	pshufb	%xmm15, \XMM2		# perform a 16 byte swap
	pshufb	%xmm15, \XMM3		# perform a 16 byte swap
	pshufb	%xmm15, \XMM4		# perform a 16 byte swap

	pxor	(%arg1), \XMM1
	pxor	(%arg1), \XMM2
	pxor	(%arg1), \XMM3
	pxor	(%arg1), \XMM4
	movdqu	HashKey_4_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps	0x10(%arg1), \TMP1
	aesenc	\TMP1, \XMM1		# Round 1
	aesenc	\TMP1, \XMM2
	aesenc	\TMP1, \XMM3
	aesenc	\TMP1, \XMM4
	movaps	0x20(%arg1), \TMP1
	aesenc	\TMP1, \XMM1		# Round 2
	aesenc	\TMP1, \XMM2
	aesenc	\TMP1, \XMM3
	aesenc	\TMP1, \XMM4
	movdqa	\XMM6, \TMP1
	pshufd	$78, \XMM6, \TMP2
	pxor	\XMM6, \TMP2
	movdqu	HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps	0x30(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 3
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps	0x40(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 4
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	movdqu	HashKey_3_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x50(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 5
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pxor	\TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	\XMM6, \XMM5
	pxor	\TMP2, \TMP6
	movdqa	\XMM7, \TMP1
	pshufd	$78, \XMM7, \TMP2
	pxor	\XMM7, \TMP2
	movdqu	HashKey_2(%arg2), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x60(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 6
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps	0x70(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 7
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	movdqu	HashKey_2_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x80(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 8
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pxor	\TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	\XMM7, \XMM5
	pxor	\TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	\XMM8, \TMP1
	pshufd	$78, \XMM8, \TMP2
	pxor	\XMM8, \TMP2
	movdqu	HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x90(%arg1), \TMP3
	aesenc	\TMP3, \XMM1		# Round 9
	aesenc	\TMP3, \XMM2
	aesenc	\TMP3, \XMM3
	aesenc	\TMP3, \XMM4
	pclmulqdq $0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	lea	0xa0(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	sub	$4,%eax			# 128->0, 192->2, 256->4
	jz	.Laes_loop_par_dec_done\@

.Laes_loop_par_dec\@:
	MOVADQ	(%r10),\TMP3
.irpc	index, 1234
	aesenc	\TMP3, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	.Laes_loop_par_dec\@

.Laes_loop_par_dec_done\@:
	MOVADQ	(%r10), \TMP3
	aesenclast \TMP3, \XMM1		# last round
	aesenclast \TMP3, \XMM2
	aesenclast \TMP3, \XMM3
	aesenclast \TMP3, \XMM4
	movdqu	HashKey_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqu	(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM1, (%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	\TMP3, \XMM1
	movdqu	16(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM2, 16(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	\TMP3, \XMM2
	movdqu	32(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM3, 32(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	\TMP3, \XMM3
	movdqu	48(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM4, 48(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa	\TMP3, \XMM4
	pshufb	%xmm15, \XMM1		# perform a 16 byte swap
	pshufb	%xmm15, \XMM2		# perform a 16 byte swap
	pshufb	%xmm15, \XMM3		# perform a 16 byte swap
	pshufb	%xmm15, \XMM4		# perform a 16 byte swap

	pxor	\TMP4, \TMP1
	pxor	\XMM8, \XMM5
	pxor	\TMP6, \TMP2
	pxor	\TMP1, \TMP2
	pxor	\XMM5, \TMP2
	movdqa	\TMP2, \TMP3
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP3, \XMM5
	pxor	\TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	\XMM5, \TMP2
	movdqa	\XMM5, \TMP3
	movdqa	\XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	$31, \TMP2		# packed left shift << 31
	pslld	$30, \TMP3		# packed left shift << 30
	pslld	$25, \TMP4		# packed left shift << 25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP5
	psrldq	$4, \TMP5		# right shift T5 1 DW
	pslldq	$12, \TMP2		# left shift T2 3 DWs
	pxor	\TMP2, \XMM5

	# second phase of reduction

	movdqa	\XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	\XMM5,\TMP3
	movdqa	\XMM5,\TMP4
	psrld	$1, \TMP2		# packed right shift >>1
	psrld	$2, \TMP3		# packed right shift >>2
	psrld	$7, \TMP4		# packed right shift >>7
	pxor	\TMP3,\TMP2		# xor the shifted versions
	pxor	\TMP4,\TMP2
	pxor	\TMP5, \TMP2
	pxor	\TMP2, \XMM5
	pxor	\TMP1, \XMM5		# result is in XMM5

	pxor	\XMM5, \XMM1
.endm
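
/* Note on the enc/dec variants above: both run the same AES counter-mode
 * rounds; the difference is only what feeds GHASH.  The dec variant copies
 * the original ciphertext back into XMM1-XMM4 after writing the plaintext,
 * so the hash is always computed over ciphertext in both directions.
 */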

/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply TMP6 * HashKey (using Karatsuba)

	movdqa	\XMM1, \TMP6
	pshufd	$78, \XMM1, \TMP2
	pxor	\XMM1, \TMP2
	movdqu	HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP6	# TMP6 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM1	# XMM1 = a0*b0
	movdqu	HashKey_4_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqa	\XMM1, \XMMDst
	movdqa	\TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	\XMM2, \TMP1
	pshufd	$78, \XMM2, \TMP2
	pxor	\XMM2, \TMP2
	movdqu	HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM2	# XMM2 = a0*b0
	movdqu	HashKey_3_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP1, \TMP6
	pxor	\XMM2, \XMMDst
	pxor	\TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	\XMM3, \TMP1
	pshufd	$78, \XMM3, \TMP2
	pxor	\XMM3, \TMP2
	movdqu	HashKey_2(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM3	# XMM3 = a0*b0
	movdqu	HashKey_2_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP1, \TMP6
	pxor	\XMM3, \XMMDst
	pxor	\TMP2, \XMM1	# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)
	movdqa	\XMM4, \TMP1
	pshufd	$78, \XMM4, \TMP2
	pxor	\XMM4, \TMP2
	movdqu	HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM4	# XMM4 = a0*b0
	movdqu	HashKey_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP1, \TMP6
	pxor	\XMM4, \XMMDst
	pxor	\XMM1, \TMP2
	pxor	\TMP6, \TMP2
	pxor	\XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	\TMP2, \TMP4
	pslldq	$8, \TMP4		# left shift TMP4 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP4, \XMMDst
	pxor	\TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	\XMMDst, \TMP2
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	$31, \TMP2		# packed left shifting << 31
	pslld	$30, \TMP3		# packed left shifting << 30
	pslld	$25, \TMP4		# packed left shifting << 25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP7
	psrldq	$4, \TMP7		# right shift TMP7 1 DW
	pslldq	$12, \TMP2		# left shift TMP2 3 DWs
	pxor	\TMP2, \XMMDst

	# second phase of the reduction
	movdqa	\XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
	psrld	$1, \TMP2		# packed right shift >> 1
	psrld	$2, \TMP3		# packed right shift >> 2
	psrld	$7, \TMP4		# packed right shift >> 7
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP4, \TMP2
	pxor	\TMP7, \TMP2
	pxor	\TMP2, \XMMDst
	pxor	\TMP6, \XMMDst		# reduced result is in XMMDst
.endm


/* Encryption of a single block
* uses eax & r10
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor	(%arg1), \XMM0
	mov	keysize,%eax
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13
	lea	16(%arg1), %r10		# get first expanded key address

_esb_loop_\@:
	MOVADQ	(%r10),\TMP1
	aesenc	\TMP1,\XMM0
	add	$16,%r10
	sub	$1,%eax
	jnz	_esb_loop_\@

	MOVADQ	(%r10),\TMP1
	aesenclast \TMP1,\XMM0
.endm
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                   struct gcm_context_data *data
*                                        // Context data
*                   u8 *out,             // Plaintext output. Encrypt in-place is allowed.
*                   const u8 *in,        // Ciphertext input
*                   u64 plaintext_len,   // Length of data in bytes for decryption.
*                   u8 *iv,              // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                   u8 *hash_subkey,     // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                   const u8 *aad,       // Additional Authentication Data (AAD)
*                   u64 aad_len,         // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                   u8 *auth_tag,        // Authenticated Tag output. The driver will compare this to the
*                                        // given authentication tag and only return the plaintext if they match.
*                   u64 auth_tag_len);   // Authenticated Tag Length in bytes. Valid values are 16
*                                        // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                        Salt  (From the SA)                    |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |        (This is the sequence number from IPSec header)        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A1)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                  32-bit Sequence Number (A0)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A2)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |              64-bit Extended Sequence Number {A1,A0}          |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                   AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
SYM_FUNC_START(aesni_gcm_dec)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC dec
	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	RET
SYM_FUNC_END(aesni_gcm_dec)


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data
*                                        // Context data
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                        Salt  (From the SA)                    |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |        (This is the sequence number from IPSec header)        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A1)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                  32-bit Sequence Number (A0)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A2)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |              64-bit Extended Sequence Number {A1,A0}          |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                   AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
SYM_FUNC_START(aesni_gcm_enc)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9

	GCM_ENC_DEC enc

	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	RET
SYM_FUNC_END(aesni_gcm_enc)

/*****************************************************************************
* void aesni_gcm_init(void *aes_ctx,     // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                        // context data
*                     u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                     u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                     const u8 *aad,     // Additional Authentication Data (AAD)
*                     u64 aad_len)       // Length of AAD in bytes.
*/
SYM_FUNC_START(aesni_gcm_init)
	FUNC_SAVE
	GCM_INIT %arg3, %arg4,%arg5, %arg6
	FUNC_RESTORE
	RET
SYM_FUNC_END(aesni_gcm_init)

/*****************************************************************************
* void aesni_gcm_enc_update(void *aes_ctx,   // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // context data
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len); // Length of data in bytes for encryption.
*/
SYM_FUNC_START(aesni_gcm_enc_update)
	FUNC_SAVE
	GCM_ENC_DEC enc
	FUNC_RESTORE
	RET
SYM_FUNC_END(aesni_gcm_enc_update)

/*****************************************************************************
* void aesni_gcm_dec_update(void *aes_ctx,   // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // context data
*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
*                    const u8 *in,       // Ciphertext input
*                    u64 plaintext_len); // Length of data in bytes for decryption.
*/
SYM_FUNC_START(aesni_gcm_dec_update)
	FUNC_SAVE
	GCM_ENC_DEC dec
	FUNC_RESTORE
	RET
SYM_FUNC_END(aesni_gcm_dec_update)

/*****************************************************************************
* void aesni_gcm_finalize(void *aes_ctx,     // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // context data
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*/
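/* Typical call sequence from the glue code (illustrative sketch only; see
 * aesni-intel_glue.c for the real driver):
 *
 *	aesni_gcm_init(aes_ctx, data, iv, hash_subkey, aad, aad_len);
 *	while (more input)
 *		aesni_gcm_enc_update(aes_ctx, data, out, in, chunk_len);
 *	aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len);
 *
 * State between calls lives entirely in struct gcm_context_data, which is
 * why each update call reloads AadHash/HashKey from *data.
 */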
SYM_FUNC_START(aesni_gcm_finalize)
	FUNC_SAVE
	GCM_COMPLETE %arg3 %arg4
	FUNC_RESTORE
	RET
SYM_FUNC_END(aesni_gcm_finalize)

#endif

SYM_FUNC_START_LOCAL(_key_expansion_256a)
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)

SYM_FUNC_START_LOCAL(_key_expansion_192a)
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192a)

SYM_FUNC_START_LOCAL(_key_expansion_192b)
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192b)

SYM_FUNC_START_LOCAL(_key_expansion_256b)
	pshufd $0b10101010, %xmm1, %xmm1
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256b)

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_256a
	aeskeygenassist $0x1, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_256a
	aeskeygenassist $0x2, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_256a
	aeskeygenassist $0x4, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_256a
	aeskeygenassist $0x8, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_256a
_key_expansion_256a 1867 aeskeygenassist $0x20, %xmm0, %xmm1 1868 call _key_expansion_256b 1869 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 1870 call _key_expansion_256a 1871 jmp .Ldec_key 1872.Lenc_key192: 1873 movq 0x10(UKEYP), %xmm2 # other user key 1874 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 1875 call _key_expansion_192a 1876 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 1877 call _key_expansion_192b 1878 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 1879 call _key_expansion_192a 1880 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 1881 call _key_expansion_192b 1882 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 1883 call _key_expansion_192a 1884 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 1885 call _key_expansion_192b 1886 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 1887 call _key_expansion_192a 1888 aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 1889 call _key_expansion_192b 1890 jmp .Ldec_key 1891.Lenc_key128: 1892 aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 1893 call _key_expansion_128 1894 aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 1895 call _key_expansion_128 1896 aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 1897 call _key_expansion_128 1898 aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 1899 call _key_expansion_128 1900 aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 1901 call _key_expansion_128 1902 aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 1903 call _key_expansion_128 1904 aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 1905 call _key_expansion_128 1906 aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 1907 call _key_expansion_128 1908 aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 1909 call _key_expansion_128 1910 aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 1911 call _key_expansion_128 1912.Ldec_key: 1913 sub $0x10, TKEYP 1914 movaps (KEYP), %xmm0 1915 movaps (TKEYP), %xmm1 1916 movaps %xmm0, 240(TKEYP) 1917 movaps %xmm1, 240(KEYP) 1918 add $0x10, KEYP 1919 lea 240-16(TKEYP), UKEYP 1920.align 4 1921.Ldec_key_loop: 1922 movaps (KEYP), %xmm0 1923 aesimc %xmm0, %xmm1 1924 movaps %xmm1, (UKEYP) 1925 add $0x10, KEYP 1926 sub $0x10, UKEYP 1927 cmp TKEYP, KEYP 1928 jb .Ldec_key_loop 1929 xor AREG, AREG 1930#ifndef __x86_64__ 1931 popl KEYP 1932#endif 1933 FRAME_END 1934 RET 1935SYM_FUNC_END(aesni_set_key) 1936 1937/* 1938 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src) 1939 */ 1940SYM_FUNC_START(aesni_enc) 1941 FRAME_BEGIN 1942#ifndef __x86_64__ 1943 pushl KEYP 1944 pushl KLEN 1945 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx 1946 movl (FRAME_OFFSET+16)(%esp), OUTP # dst 1947 movl (FRAME_OFFSET+20)(%esp), INP # src 1948#endif 1949 movl 480(KEYP), KLEN # key length 1950 movups (INP), STATE # input 1951 call _aesni_enc1 1952 movups STATE, (OUTP) # output 1953#ifndef __x86_64__ 1954 popl KLEN 1955 popl KEYP 1956#endif 1957 FRAME_END 1958 RET 1959SYM_FUNC_END(aesni_enc) 1960 1961/* 1962 * _aesni_enc1: internal ABI 1963 * input: 1964 * KEYP: key struct pointer 1965 * KLEN: round count 1966 * STATE: initial state (input) 1967 * output: 1968 * STATE: finial state (output) 1969 * changed: 1970 * KEY 1971 * TKEYP (T1) 1972 */ 1973SYM_FUNC_START_LOCAL(_aesni_enc1) 1974 movaps (KEYP), KEY # key 1975 mov KEYP, TKEYP 1976 pxor KEY, STATE # round 0 1977 add $0x30, TKEYP 1978 cmp $24, KLEN 1979 jb .Lenc128 1980 lea 0x20(TKEYP), TKEYP 1981 je .Lenc192 1982 add $0x20, TKEYP 1983 movaps -0x60(TKEYP), KEY 1984 aesenc KEY, STATE 1985 movaps -0x50(TKEYP), KEY 1986 aesenc KEY, STATE 1987.align 4 1988.Lenc192: 1989 movaps -0x40(TKEYP), KEY 1990 aesenc KEY, STATE 1991 movaps -0x30(TKEYP), KEY 
1992 aesenc KEY, STATE 1993.align 4 1994.Lenc128: 1995 movaps -0x20(TKEYP), KEY 1996 aesenc KEY, STATE 1997 movaps -0x10(TKEYP), KEY 1998 aesenc KEY, STATE 1999 movaps (TKEYP), KEY 2000 aesenc KEY, STATE 2001 movaps 0x10(TKEYP), KEY 2002 aesenc KEY, STATE 2003 movaps 0x20(TKEYP), KEY 2004 aesenc KEY, STATE 2005 movaps 0x30(TKEYP), KEY 2006 aesenc KEY, STATE 2007 movaps 0x40(TKEYP), KEY 2008 aesenc KEY, STATE 2009 movaps 0x50(TKEYP), KEY 2010 aesenc KEY, STATE 2011 movaps 0x60(TKEYP), KEY 2012 aesenc KEY, STATE 2013 movaps 0x70(TKEYP), KEY 2014 aesenclast KEY, STATE 2015 RET 2016SYM_FUNC_END(_aesni_enc1) 2017 2018/* 2019 * _aesni_enc4: internal ABI 2020 * input: 2021 * KEYP: key struct pointer 2022 * KLEN: round count 2023 * STATE1: initial state (input) 2024 * STATE2 2025 * STATE3 2026 * STATE4 2027 * output: 2028 * STATE1: finial state (output) 2029 * STATE2 2030 * STATE3 2031 * STATE4 2032 * changed: 2033 * KEY 2034 * TKEYP (T1) 2035 */ 2036SYM_FUNC_START_LOCAL(_aesni_enc4) 2037 movaps (KEYP), KEY # key 2038 mov KEYP, TKEYP 2039 pxor KEY, STATE1 # round 0 2040 pxor KEY, STATE2 2041 pxor KEY, STATE3 2042 pxor KEY, STATE4 2043 add $0x30, TKEYP 2044 cmp $24, KLEN 2045 jb .L4enc128 2046 lea 0x20(TKEYP), TKEYP 2047 je .L4enc192 2048 add $0x20, TKEYP 2049 movaps -0x60(TKEYP), KEY 2050 aesenc KEY, STATE1 2051 aesenc KEY, STATE2 2052 aesenc KEY, STATE3 2053 aesenc KEY, STATE4 2054 movaps -0x50(TKEYP), KEY 2055 aesenc KEY, STATE1 2056 aesenc KEY, STATE2 2057 aesenc KEY, STATE3 2058 aesenc KEY, STATE4 2059#.align 4 2060.L4enc192: 2061 movaps -0x40(TKEYP), KEY 2062 aesenc KEY, STATE1 2063 aesenc KEY, STATE2 2064 aesenc KEY, STATE3 2065 aesenc KEY, STATE4 2066 movaps -0x30(TKEYP), KEY 2067 aesenc KEY, STATE1 2068 aesenc KEY, STATE2 2069 aesenc KEY, STATE3 2070 aesenc KEY, STATE4 2071#.align 4 2072.L4enc128: 2073 movaps -0x20(TKEYP), KEY 2074 aesenc KEY, STATE1 2075 aesenc KEY, STATE2 2076 aesenc KEY, STATE3 2077 aesenc KEY, STATE4 2078 movaps -0x10(TKEYP), KEY 2079 aesenc KEY, STATE1 2080 aesenc KEY, STATE2 2081 aesenc KEY, STATE3 2082 aesenc KEY, STATE4 2083 movaps (TKEYP), KEY 2084 aesenc KEY, STATE1 2085 aesenc KEY, STATE2 2086 aesenc KEY, STATE3 2087 aesenc KEY, STATE4 2088 movaps 0x10(TKEYP), KEY 2089 aesenc KEY, STATE1 2090 aesenc KEY, STATE2 2091 aesenc KEY, STATE3 2092 aesenc KEY, STATE4 2093 movaps 0x20(TKEYP), KEY 2094 aesenc KEY, STATE1 2095 aesenc KEY, STATE2 2096 aesenc KEY, STATE3 2097 aesenc KEY, STATE4 2098 movaps 0x30(TKEYP), KEY 2099 aesenc KEY, STATE1 2100 aesenc KEY, STATE2 2101 aesenc KEY, STATE3 2102 aesenc KEY, STATE4 2103 movaps 0x40(TKEYP), KEY 2104 aesenc KEY, STATE1 2105 aesenc KEY, STATE2 2106 aesenc KEY, STATE3 2107 aesenc KEY, STATE4 2108 movaps 0x50(TKEYP), KEY 2109 aesenc KEY, STATE1 2110 aesenc KEY, STATE2 2111 aesenc KEY, STATE3 2112 aesenc KEY, STATE4 2113 movaps 0x60(TKEYP), KEY 2114 aesenc KEY, STATE1 2115 aesenc KEY, STATE2 2116 aesenc KEY, STATE3 2117 aesenc KEY, STATE4 2118 movaps 0x70(TKEYP), KEY 2119 aesenclast KEY, STATE1 # last round 2120 aesenclast KEY, STATE2 2121 aesenclast KEY, STATE3 2122 aesenclast KEY, STATE4 2123 RET 2124SYM_FUNC_END(_aesni_enc4) 2125 2126/* 2127 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src) 2128 */ 2129SYM_FUNC_START(aesni_dec) 2130 FRAME_BEGIN 2131#ifndef __x86_64__ 2132 pushl KEYP 2133 pushl KLEN 2134 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx 2135 movl (FRAME_OFFSET+16)(%esp), OUTP # dst 2136 movl (FRAME_OFFSET+20)(%esp), INP # src 2137#endif 2138 mov 480(KEYP), KLEN # key length 2139 add $240, KEYP 2140 movups 
(INP), STATE # input 2141 call _aesni_dec1 2142 movups STATE, (OUTP) #output 2143#ifndef __x86_64__ 2144 popl KLEN 2145 popl KEYP 2146#endif 2147 FRAME_END 2148 RET 2149SYM_FUNC_END(aesni_dec) 2150 2151/* 2152 * _aesni_dec1: internal ABI 2153 * input: 2154 * KEYP: key struct pointer 2155 * KLEN: key length 2156 * STATE: initial state (input) 2157 * output: 2158 * STATE: finial state (output) 2159 * changed: 2160 * KEY 2161 * TKEYP (T1) 2162 */ 2163SYM_FUNC_START_LOCAL(_aesni_dec1) 2164 movaps (KEYP), KEY # key 2165 mov KEYP, TKEYP 2166 pxor KEY, STATE # round 0 2167 add $0x30, TKEYP 2168 cmp $24, KLEN 2169 jb .Ldec128 2170 lea 0x20(TKEYP), TKEYP 2171 je .Ldec192 2172 add $0x20, TKEYP 2173 movaps -0x60(TKEYP), KEY 2174 aesdec KEY, STATE 2175 movaps -0x50(TKEYP), KEY 2176 aesdec KEY, STATE 2177.align 4 2178.Ldec192: 2179 movaps -0x40(TKEYP), KEY 2180 aesdec KEY, STATE 2181 movaps -0x30(TKEYP), KEY 2182 aesdec KEY, STATE 2183.align 4 2184.Ldec128: 2185 movaps -0x20(TKEYP), KEY 2186 aesdec KEY, STATE 2187 movaps -0x10(TKEYP), KEY 2188 aesdec KEY, STATE 2189 movaps (TKEYP), KEY 2190 aesdec KEY, STATE 2191 movaps 0x10(TKEYP), KEY 2192 aesdec KEY, STATE 2193 movaps 0x20(TKEYP), KEY 2194 aesdec KEY, STATE 2195 movaps 0x30(TKEYP), KEY 2196 aesdec KEY, STATE 2197 movaps 0x40(TKEYP), KEY 2198 aesdec KEY, STATE 2199 movaps 0x50(TKEYP), KEY 2200 aesdec KEY, STATE 2201 movaps 0x60(TKEYP), KEY 2202 aesdec KEY, STATE 2203 movaps 0x70(TKEYP), KEY 2204 aesdeclast KEY, STATE 2205 RET 2206SYM_FUNC_END(_aesni_dec1) 2207 2208/* 2209 * _aesni_dec4: internal ABI 2210 * input: 2211 * KEYP: key struct pointer 2212 * KLEN: key length 2213 * STATE1: initial state (input) 2214 * STATE2 2215 * STATE3 2216 * STATE4 2217 * output: 2218 * STATE1: finial state (output) 2219 * STATE2 2220 * STATE3 2221 * STATE4 2222 * changed: 2223 * KEY 2224 * TKEYP (T1) 2225 */ 2226SYM_FUNC_START_LOCAL(_aesni_dec4) 2227 movaps (KEYP), KEY # key 2228 mov KEYP, TKEYP 2229 pxor KEY, STATE1 # round 0 2230 pxor KEY, STATE2 2231 pxor KEY, STATE3 2232 pxor KEY, STATE4 2233 add $0x30, TKEYP 2234 cmp $24, KLEN 2235 jb .L4dec128 2236 lea 0x20(TKEYP), TKEYP 2237 je .L4dec192 2238 add $0x20, TKEYP 2239 movaps -0x60(TKEYP), KEY 2240 aesdec KEY, STATE1 2241 aesdec KEY, STATE2 2242 aesdec KEY, STATE3 2243 aesdec KEY, STATE4 2244 movaps -0x50(TKEYP), KEY 2245 aesdec KEY, STATE1 2246 aesdec KEY, STATE2 2247 aesdec KEY, STATE3 2248 aesdec KEY, STATE4 2249.align 4 2250.L4dec192: 2251 movaps -0x40(TKEYP), KEY 2252 aesdec KEY, STATE1 2253 aesdec KEY, STATE2 2254 aesdec KEY, STATE3 2255 aesdec KEY, STATE4 2256 movaps -0x30(TKEYP), KEY 2257 aesdec KEY, STATE1 2258 aesdec KEY, STATE2 2259 aesdec KEY, STATE3 2260 aesdec KEY, STATE4 2261.align 4 2262.L4dec128: 2263 movaps -0x20(TKEYP), KEY 2264 aesdec KEY, STATE1 2265 aesdec KEY, STATE2 2266 aesdec KEY, STATE3 2267 aesdec KEY, STATE4 2268 movaps -0x10(TKEYP), KEY 2269 aesdec KEY, STATE1 2270 aesdec KEY, STATE2 2271 aesdec KEY, STATE3 2272 aesdec KEY, STATE4 2273 movaps (TKEYP), KEY 2274 aesdec KEY, STATE1 2275 aesdec KEY, STATE2 2276 aesdec KEY, STATE3 2277 aesdec KEY, STATE4 2278 movaps 0x10(TKEYP), KEY 2279 aesdec KEY, STATE1 2280 aesdec KEY, STATE2 2281 aesdec KEY, STATE3 2282 aesdec KEY, STATE4 2283 movaps 0x20(TKEYP), KEY 2284 aesdec KEY, STATE1 2285 aesdec KEY, STATE2 2286 aesdec KEY, STATE3 2287 aesdec KEY, STATE4 2288 movaps 0x30(TKEYP), KEY 2289 aesdec KEY, STATE1 2290 aesdec KEY, STATE2 2291 aesdec KEY, STATE3 2292 aesdec KEY, STATE4 2293 movaps 0x40(TKEYP), KEY 2294 aesdec KEY, STATE1 2295 aesdec KEY, 
STATE2 2296 aesdec KEY, STATE3 2297 aesdec KEY, STATE4 2298 movaps 0x50(TKEYP), KEY 2299 aesdec KEY, STATE1 2300 aesdec KEY, STATE2 2301 aesdec KEY, STATE3 2302 aesdec KEY, STATE4 2303 movaps 0x60(TKEYP), KEY 2304 aesdec KEY, STATE1 2305 aesdec KEY, STATE2 2306 aesdec KEY, STATE3 2307 aesdec KEY, STATE4 2308 movaps 0x70(TKEYP), KEY 2309 aesdeclast KEY, STATE1 # last round 2310 aesdeclast KEY, STATE2 2311 aesdeclast KEY, STATE3 2312 aesdeclast KEY, STATE4 2313 RET 2314SYM_FUNC_END(_aesni_dec4) 2315 2316/* 2317 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2318 * size_t len) 2319 */ 2320SYM_FUNC_START(aesni_ecb_enc) 2321 FRAME_BEGIN 2322#ifndef __x86_64__ 2323 pushl LEN 2324 pushl KEYP 2325 pushl KLEN 2326 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2327 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2328 movl (FRAME_OFFSET+24)(%esp), INP # src 2329 movl (FRAME_OFFSET+28)(%esp), LEN # len 2330#endif 2331 test LEN, LEN # check length 2332 jz .Lecb_enc_ret 2333 mov 480(KEYP), KLEN 2334 cmp $16, LEN 2335 jb .Lecb_enc_ret 2336 cmp $64, LEN 2337 jb .Lecb_enc_loop1 2338.align 4 2339.Lecb_enc_loop4: 2340 movups (INP), STATE1 2341 movups 0x10(INP), STATE2 2342 movups 0x20(INP), STATE3 2343 movups 0x30(INP), STATE4 2344 call _aesni_enc4 2345 movups STATE1, (OUTP) 2346 movups STATE2, 0x10(OUTP) 2347 movups STATE3, 0x20(OUTP) 2348 movups STATE4, 0x30(OUTP) 2349 sub $64, LEN 2350 add $64, INP 2351 add $64, OUTP 2352 cmp $64, LEN 2353 jge .Lecb_enc_loop4 2354 cmp $16, LEN 2355 jb .Lecb_enc_ret 2356.align 4 2357.Lecb_enc_loop1: 2358 movups (INP), STATE1 2359 call _aesni_enc1 2360 movups STATE1, (OUTP) 2361 sub $16, LEN 2362 add $16, INP 2363 add $16, OUTP 2364 cmp $16, LEN 2365 jge .Lecb_enc_loop1 2366.Lecb_enc_ret: 2367#ifndef __x86_64__ 2368 popl KLEN 2369 popl KEYP 2370 popl LEN 2371#endif 2372 FRAME_END 2373 RET 2374SYM_FUNC_END(aesni_ecb_enc) 2375 2376/* 2377 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2378 * size_t len); 2379 */ 2380SYM_FUNC_START(aesni_ecb_dec) 2381 FRAME_BEGIN 2382#ifndef __x86_64__ 2383 pushl LEN 2384 pushl KEYP 2385 pushl KLEN 2386 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2387 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2388 movl (FRAME_OFFSET+24)(%esp), INP # src 2389 movl (FRAME_OFFSET+28)(%esp), LEN # len 2390#endif 2391 test LEN, LEN 2392 jz .Lecb_dec_ret 2393 mov 480(KEYP), KLEN 2394 add $240, KEYP 2395 cmp $16, LEN 2396 jb .Lecb_dec_ret 2397 cmp $64, LEN 2398 jb .Lecb_dec_loop1 2399.align 4 2400.Lecb_dec_loop4: 2401 movups (INP), STATE1 2402 movups 0x10(INP), STATE2 2403 movups 0x20(INP), STATE3 2404 movups 0x30(INP), STATE4 2405 call _aesni_dec4 2406 movups STATE1, (OUTP) 2407 movups STATE2, 0x10(OUTP) 2408 movups STATE3, 0x20(OUTP) 2409 movups STATE4, 0x30(OUTP) 2410 sub $64, LEN 2411 add $64, INP 2412 add $64, OUTP 2413 cmp $64, LEN 2414 jge .Lecb_dec_loop4 2415 cmp $16, LEN 2416 jb .Lecb_dec_ret 2417.align 4 2418.Lecb_dec_loop1: 2419 movups (INP), STATE1 2420 call _aesni_dec1 2421 movups STATE1, (OUTP) 2422 sub $16, LEN 2423 add $16, INP 2424 add $16, OUTP 2425 cmp $16, LEN 2426 jge .Lecb_dec_loop1 2427.Lecb_dec_ret: 2428#ifndef __x86_64__ 2429 popl KLEN 2430 popl KEYP 2431 popl LEN 2432#endif 2433 FRAME_END 2434 RET 2435SYM_FUNC_END(aesni_ecb_dec) 2436 2437/* 2438 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2439 * size_t len, u8 *iv) 2440 */ 2441SYM_FUNC_START(aesni_cbc_enc) 2442 FRAME_BEGIN 2443#ifndef __x86_64__ 2444 pushl IVP 2445 pushl LEN 2446 pushl KEYP 2447 pushl KLEN 2448 movl 
(FRAME_OFFSET+20)(%esp), KEYP # ctx 2449 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2450 movl (FRAME_OFFSET+28)(%esp), INP # src 2451 movl (FRAME_OFFSET+32)(%esp), LEN # len 2452 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2453#endif 2454 cmp $16, LEN 2455 jb .Lcbc_enc_ret 2456 mov 480(KEYP), KLEN 2457 movups (IVP), STATE # load iv as initial state 2458.align 4 2459.Lcbc_enc_loop: 2460 movups (INP), IN # load input 2461 pxor IN, STATE 2462 call _aesni_enc1 2463 movups STATE, (OUTP) # store output 2464 sub $16, LEN 2465 add $16, INP 2466 add $16, OUTP 2467 cmp $16, LEN 2468 jge .Lcbc_enc_loop 2469 movups STATE, (IVP) 2470.Lcbc_enc_ret: 2471#ifndef __x86_64__ 2472 popl KLEN 2473 popl KEYP 2474 popl LEN 2475 popl IVP 2476#endif 2477 FRAME_END 2478 RET 2479SYM_FUNC_END(aesni_cbc_enc) 2480 2481/* 2482 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2483 * size_t len, u8 *iv) 2484 */ 2485SYM_FUNC_START(aesni_cbc_dec) 2486 FRAME_BEGIN 2487#ifndef __x86_64__ 2488 pushl IVP 2489 pushl LEN 2490 pushl KEYP 2491 pushl KLEN 2492 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2493 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2494 movl (FRAME_OFFSET+28)(%esp), INP # src 2495 movl (FRAME_OFFSET+32)(%esp), LEN # len 2496 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2497#endif 2498 cmp $16, LEN 2499 jb .Lcbc_dec_just_ret 2500 mov 480(KEYP), KLEN 2501 add $240, KEYP 2502 movups (IVP), IV 2503 cmp $64, LEN 2504 jb .Lcbc_dec_loop1 2505.align 4 2506.Lcbc_dec_loop4: 2507 movups (INP), IN1 2508 movaps IN1, STATE1 2509 movups 0x10(INP), IN2 2510 movaps IN2, STATE2 2511#ifdef __x86_64__ 2512 movups 0x20(INP), IN3 2513 movaps IN3, STATE3 2514 movups 0x30(INP), IN4 2515 movaps IN4, STATE4 2516#else 2517 movups 0x20(INP), IN1 2518 movaps IN1, STATE3 2519 movups 0x30(INP), IN2 2520 movaps IN2, STATE4 2521#endif 2522 call _aesni_dec4 2523 pxor IV, STATE1 2524#ifdef __x86_64__ 2525 pxor IN1, STATE2 2526 pxor IN2, STATE3 2527 pxor IN3, STATE4 2528 movaps IN4, IV 2529#else 2530 pxor IN1, STATE4 2531 movaps IN2, IV 2532 movups (INP), IN1 2533 pxor IN1, STATE2 2534 movups 0x10(INP), IN2 2535 pxor IN2, STATE3 2536#endif 2537 movups STATE1, (OUTP) 2538 movups STATE2, 0x10(OUTP) 2539 movups STATE3, 0x20(OUTP) 2540 movups STATE4, 0x30(OUTP) 2541 sub $64, LEN 2542 add $64, INP 2543 add $64, OUTP 2544 cmp $64, LEN 2545 jge .Lcbc_dec_loop4 2546 cmp $16, LEN 2547 jb .Lcbc_dec_ret 2548.align 4 2549.Lcbc_dec_loop1: 2550 movups (INP), IN 2551 movaps IN, STATE 2552 call _aesni_dec1 2553 pxor IV, STATE 2554 movups STATE, (OUTP) 2555 movaps IN, IV 2556 sub $16, LEN 2557 add $16, INP 2558 add $16, OUTP 2559 cmp $16, LEN 2560 jge .Lcbc_dec_loop1 2561.Lcbc_dec_ret: 2562 movups IV, (IVP) 2563.Lcbc_dec_just_ret: 2564#ifndef __x86_64__ 2565 popl KLEN 2566 popl KEYP 2567 popl LEN 2568 popl IVP 2569#endif 2570 FRAME_END 2571 RET 2572SYM_FUNC_END(aesni_cbc_dec) 2573 2574/* 2575 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2576 * size_t len, u8 *iv) 2577 */ 2578SYM_FUNC_START(aesni_cts_cbc_enc) 2579 FRAME_BEGIN 2580#ifndef __x86_64__ 2581 pushl IVP 2582 pushl LEN 2583 pushl KEYP 2584 pushl KLEN 2585 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2586 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2587 movl (FRAME_OFFSET+28)(%esp), INP # src 2588 movl (FRAME_OFFSET+32)(%esp), LEN # len 2589 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2590 lea .Lcts_permute_table, T1 2591#else 2592 lea .Lcts_permute_table(%rip), T1 2593#endif 2594 mov 480(KEYP), KLEN 2595 movups (IVP), STATE 2596 sub $16, LEN 2597 mov T1, IVP 2598 add $32, IVP 2599 
add LEN, T1 2600 sub LEN, IVP 2601 movups (T1), %xmm4 2602 movups (IVP), %xmm5 2603 2604 movups (INP), IN1 2605 add LEN, INP 2606 movups (INP), IN2 2607 2608 pxor IN1, STATE 2609 call _aesni_enc1 2610 2611 pshufb %xmm5, IN2 2612 pxor STATE, IN2 2613 pshufb %xmm4, STATE 2614 add OUTP, LEN 2615 movups STATE, (LEN) 2616 2617 movaps IN2, STATE 2618 call _aesni_enc1 2619 movups STATE, (OUTP) 2620 2621#ifndef __x86_64__ 2622 popl KLEN 2623 popl KEYP 2624 popl LEN 2625 popl IVP 2626#endif 2627 FRAME_END 2628 RET 2629SYM_FUNC_END(aesni_cts_cbc_enc) 2630 2631/* 2632 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2633 * size_t len, u8 *iv) 2634 */ 2635SYM_FUNC_START(aesni_cts_cbc_dec) 2636 FRAME_BEGIN 2637#ifndef __x86_64__ 2638 pushl IVP 2639 pushl LEN 2640 pushl KEYP 2641 pushl KLEN 2642 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2643 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2644 movl (FRAME_OFFSET+28)(%esp), INP # src 2645 movl (FRAME_OFFSET+32)(%esp), LEN # len 2646 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2647 lea .Lcts_permute_table, T1 2648#else 2649 lea .Lcts_permute_table(%rip), T1 2650#endif 2651 mov 480(KEYP), KLEN 2652 add $240, KEYP 2653 movups (IVP), IV 2654 sub $16, LEN 2655 mov T1, IVP 2656 add $32, IVP 2657 add LEN, T1 2658 sub LEN, IVP 2659 movups (T1), %xmm4 2660 2661 movups (INP), STATE 2662 add LEN, INP 2663 movups (INP), IN1 2664 2665 call _aesni_dec1 2666 movaps STATE, IN2 2667 pshufb %xmm4, STATE 2668 pxor IN1, STATE 2669 2670 add OUTP, LEN 2671 movups STATE, (LEN) 2672 2673 movups (IVP), %xmm0 2674 pshufb %xmm0, IN1 2675 pblendvb IN2, IN1 2676 movaps IN1, STATE 2677 call _aesni_dec1 2678 2679 pxor IV, STATE 2680 movups STATE, (OUTP) 2681 2682#ifndef __x86_64__ 2683 popl KLEN 2684 popl KEYP 2685 popl LEN 2686 popl IVP 2687#endif 2688 FRAME_END 2689 RET 2690SYM_FUNC_END(aesni_cts_cbc_dec) 2691 2692.pushsection .rodata 2693.align 16 2694.Lcts_permute_table: 2695 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 2696 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 2697 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 2698 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f 2699 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 2700 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 2701#ifdef __x86_64__ 2702.Lbswap_mask: 2703 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2704#endif 2705.popsection 2706 2707#ifdef __x86_64__ 2708/* 2709 * _aesni_inc_init: internal ABI 2710 * setup registers used by _aesni_inc 2711 * input: 2712 * IV 2713 * output: 2714 * CTR: == IV, in little endian 2715 * TCTR_LOW: == lower qword of CTR 2716 * INC: == 1, in little endian 2717 * BSWAP_MASK == endian swapping mask 2718 */ 2719SYM_FUNC_START_LOCAL(_aesni_inc_init) 2720 movaps .Lbswap_mask(%rip), BSWAP_MASK 2721 movaps IV, CTR 2722 pshufb BSWAP_MASK, CTR 2723 mov $1, TCTR_LOW 2724 movq TCTR_LOW, INC 2725 movq CTR, TCTR_LOW 2726 RET 2727SYM_FUNC_END(_aesni_inc_init) 2728 2729/* 2730 * _aesni_inc: internal ABI 2731 * Increase IV by 1, IV is in big endian 2732 * input: 2733 * IV 2734 * CTR: == IV, in little endian 2735 * TCTR_LOW: == lower qword of CTR 2736 * INC: == 1, in little endian 2737 * BSWAP_MASK == endian swapping mask 2738 * output: 2739 * IV: Increase by 1 2740 * changed: 2741 * CTR: == output IV, in little endian 2742 * TCTR_LOW: == lower qword of CTR 2743 */ 2744SYM_FUNC_START_LOCAL(_aesni_inc) 2745 paddq INC, CTR 2746 add $1, TCTR_LOW 2747 jnc .Linc_low 2748 pslldq $8, INC 2749 paddq INC, CTR 2750 psrldq $8, INC 2751.Linc_low: 2752 
movaps CTR, IV 2753 pshufb BSWAP_MASK, IV 2754 RET 2755SYM_FUNC_END(_aesni_inc) 2756 2757/* 2758 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2759 * size_t len, u8 *iv) 2760 */ 2761SYM_FUNC_START(aesni_ctr_enc) 2762 FRAME_BEGIN 2763 cmp $16, LEN 2764 jb .Lctr_enc_just_ret 2765 mov 480(KEYP), KLEN 2766 movups (IVP), IV 2767 call _aesni_inc_init 2768 cmp $64, LEN 2769 jb .Lctr_enc_loop1 2770.align 4 2771.Lctr_enc_loop4: 2772 movaps IV, STATE1 2773 call _aesni_inc 2774 movups (INP), IN1 2775 movaps IV, STATE2 2776 call _aesni_inc 2777 movups 0x10(INP), IN2 2778 movaps IV, STATE3 2779 call _aesni_inc 2780 movups 0x20(INP), IN3 2781 movaps IV, STATE4 2782 call _aesni_inc 2783 movups 0x30(INP), IN4 2784 call _aesni_enc4 2785 pxor IN1, STATE1 2786 movups STATE1, (OUTP) 2787 pxor IN2, STATE2 2788 movups STATE2, 0x10(OUTP) 2789 pxor IN3, STATE3 2790 movups STATE3, 0x20(OUTP) 2791 pxor IN4, STATE4 2792 movups STATE4, 0x30(OUTP) 2793 sub $64, LEN 2794 add $64, INP 2795 add $64, OUTP 2796 cmp $64, LEN 2797 jge .Lctr_enc_loop4 2798 cmp $16, LEN 2799 jb .Lctr_enc_ret 2800.align 4 2801.Lctr_enc_loop1: 2802 movaps IV, STATE 2803 call _aesni_inc 2804 movups (INP), IN 2805 call _aesni_enc1 2806 pxor IN, STATE 2807 movups STATE, (OUTP) 2808 sub $16, LEN 2809 add $16, INP 2810 add $16, OUTP 2811 cmp $16, LEN 2812 jge .Lctr_enc_loop1 2813.Lctr_enc_ret: 2814 movups IV, (IVP) 2815.Lctr_enc_just_ret: 2816 FRAME_END 2817 RET 2818SYM_FUNC_END(aesni_ctr_enc) 2819 2820#endif 2821 2822.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 2823.align 16 2824.Lgf128mul_x_ble_mask: 2825 .octa 0x00000000000000010000000000000087 2826.previous 2827 2828/* 2829 * _aesni_gf128mul_x_ble: internal ABI 2830 * Multiply in GF(2^128) for XTS IVs 2831 * input: 2832 * IV: current IV 2833 * GF128MUL_MASK == mask with 0x87 and 0x01 2834 * output: 2835 * IV: next IV 2836 * changed: 2837 * CTR: == temporary value 2838 */ 2839#define _aesni_gf128mul_x_ble() \ 2840 pshufd $0x13, IV, KEY; \ 2841 paddq IV, IV; \ 2842 psrad $31, KEY; \ 2843 pand GF128MUL_MASK, KEY; \ 2844 pxor KEY, IV; 2845 2846/* 2847 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, 2848 * const u8 *src, unsigned int len, le128 *iv) 2849 */ 2850SYM_FUNC_START(aesni_xts_encrypt) 2851 FRAME_BEGIN 2852#ifndef __x86_64__ 2853 pushl IVP 2854 pushl LEN 2855 pushl KEYP 2856 pushl KLEN 2857 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2858 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2859 movl (FRAME_OFFSET+28)(%esp), INP # src 2860 movl (FRAME_OFFSET+32)(%esp), LEN # len 2861 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2862 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK 2863#else 2864 movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK 2865#endif 2866 movups (IVP), IV 2867 2868 mov 480(KEYP), KLEN 2869 2870.Lxts_enc_loop4: 2871 sub $64, LEN 2872 jl .Lxts_enc_1x 2873 2874 movdqa IV, STATE1 2875 movdqu 0x00(INP), IN 2876 pxor IN, STATE1 2877 movdqu IV, 0x00(OUTP) 2878 2879 _aesni_gf128mul_x_ble() 2880 movdqa IV, STATE2 2881 movdqu 0x10(INP), IN 2882 pxor IN, STATE2 2883 movdqu IV, 0x10(OUTP) 2884 2885 _aesni_gf128mul_x_ble() 2886 movdqa IV, STATE3 2887 movdqu 0x20(INP), IN 2888 pxor IN, STATE3 2889 movdqu IV, 0x20(OUTP) 2890 2891 _aesni_gf128mul_x_ble() 2892 movdqa IV, STATE4 2893 movdqu 0x30(INP), IN 2894 pxor IN, STATE4 2895 movdqu IV, 0x30(OUTP) 2896 2897 call _aesni_enc4 2898 2899 movdqu 0x00(OUTP), IN 2900 pxor IN, STATE1 2901 movdqu STATE1, 0x00(OUTP) 2902 2903 movdqu 0x10(OUTP), IN 2904 pxor IN, STATE2 2905 movdqu STATE2, 0x10(OUTP) 2906 
2907 movdqu 0x20(OUTP), IN 2908 pxor IN, STATE3 2909 movdqu STATE3, 0x20(OUTP) 2910 2911 movdqu 0x30(OUTP), IN 2912 pxor IN, STATE4 2913 movdqu STATE4, 0x30(OUTP) 2914 2915 _aesni_gf128mul_x_ble() 2916 2917 add $64, INP 2918 add $64, OUTP 2919 test LEN, LEN 2920 jnz .Lxts_enc_loop4 2921 2922.Lxts_enc_ret_iv: 2923 movups IV, (IVP) 2924 2925.Lxts_enc_ret: 2926#ifndef __x86_64__ 2927 popl KLEN 2928 popl KEYP 2929 popl LEN 2930 popl IVP 2931#endif 2932 FRAME_END 2933 RET 2934 2935.Lxts_enc_1x: 2936 add $64, LEN 2937 jz .Lxts_enc_ret_iv 2938 sub $16, LEN 2939 jl .Lxts_enc_cts4 2940 2941.Lxts_enc_loop1: 2942 movdqu (INP), STATE 2943 pxor IV, STATE 2944 call _aesni_enc1 2945 pxor IV, STATE 2946 _aesni_gf128mul_x_ble() 2947 2948 test LEN, LEN 2949 jz .Lxts_enc_out 2950 2951 add $16, INP 2952 sub $16, LEN 2953 jl .Lxts_enc_cts1 2954 2955 movdqu STATE, (OUTP) 2956 add $16, OUTP 2957 jmp .Lxts_enc_loop1 2958 2959.Lxts_enc_out: 2960 movdqu STATE, (OUTP) 2961 jmp .Lxts_enc_ret_iv 2962 2963.Lxts_enc_cts4: 2964 movdqa STATE4, STATE 2965 sub $16, OUTP 2966 2967.Lxts_enc_cts1: 2968#ifndef __x86_64__ 2969 lea .Lcts_permute_table, T1 2970#else 2971 lea .Lcts_permute_table(%rip), T1 2972#endif 2973 add LEN, INP /* rewind input pointer */ 2974 add $16, LEN /* # bytes in final block */ 2975 movups (INP), IN1 2976 2977 mov T1, IVP 2978 add $32, IVP 2979 add LEN, T1 2980 sub LEN, IVP 2981 add OUTP, LEN 2982 2983 movups (T1), %xmm4 2984 movaps STATE, IN2 2985 pshufb %xmm4, STATE 2986 movups STATE, (LEN) 2987 2988 movups (IVP), %xmm0 2989 pshufb %xmm0, IN1 2990 pblendvb IN2, IN1 2991 movaps IN1, STATE 2992 2993 pxor IV, STATE 2994 call _aesni_enc1 2995 pxor IV, STATE 2996 2997 movups STATE, (OUTP) 2998 jmp .Lxts_enc_ret 2999SYM_FUNC_END(aesni_xts_encrypt) 3000 3001/* 3002 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, 3003 * const u8 *src, unsigned int len, le128 *iv) 3004 */ 3005SYM_FUNC_START(aesni_xts_decrypt) 3006 FRAME_BEGIN 3007#ifndef __x86_64__ 3008 pushl IVP 3009 pushl LEN 3010 pushl KEYP 3011 pushl KLEN 3012 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 3013 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 3014 movl (FRAME_OFFSET+28)(%esp), INP # src 3015 movl (FRAME_OFFSET+32)(%esp), LEN # len 3016 movl (FRAME_OFFSET+36)(%esp), IVP # iv 3017 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK 3018#else 3019 movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK 3020#endif 3021 movups (IVP), IV 3022 3023 mov 480(KEYP), KLEN 3024 add $240, KEYP 3025 3026 test $15, LEN 3027 jz .Lxts_dec_loop4 3028 sub $16, LEN 3029 3030.Lxts_dec_loop4: 3031 sub $64, LEN 3032 jl .Lxts_dec_1x 3033 3034 movdqa IV, STATE1 3035 movdqu 0x00(INP), IN 3036 pxor IN, STATE1 3037 movdqu IV, 0x00(OUTP) 3038 3039 _aesni_gf128mul_x_ble() 3040 movdqa IV, STATE2 3041 movdqu 0x10(INP), IN 3042 pxor IN, STATE2 3043 movdqu IV, 0x10(OUTP) 3044 3045 _aesni_gf128mul_x_ble() 3046 movdqa IV, STATE3 3047 movdqu 0x20(INP), IN 3048 pxor IN, STATE3 3049 movdqu IV, 0x20(OUTP) 3050 3051 _aesni_gf128mul_x_ble() 3052 movdqa IV, STATE4 3053 movdqu 0x30(INP), IN 3054 pxor IN, STATE4 3055 movdqu IV, 0x30(OUTP) 3056 3057 call _aesni_dec4 3058 3059 movdqu 0x00(OUTP), IN 3060 pxor IN, STATE1 3061 movdqu STATE1, 0x00(OUTP) 3062 3063 movdqu 0x10(OUTP), IN 3064 pxor IN, STATE2 3065 movdqu STATE2, 0x10(OUTP) 3066 3067 movdqu 0x20(OUTP), IN 3068 pxor IN, STATE3 3069 movdqu STATE3, 0x20(OUTP) 3070 3071 movdqu 0x30(OUTP), IN 3072 pxor IN, STATE4 3073 movdqu STATE4, 0x30(OUTP) 3074 3075 _aesni_gf128mul_x_ble() 3076 3077 add $64, INP 3078 add $64, OUTP 3079 test LEN, LEN 
3080 jnz .Lxts_dec_loop4 3081 3082.Lxts_dec_ret_iv: 3083 movups IV, (IVP) 3084 3085.Lxts_dec_ret: 3086#ifndef __x86_64__ 3087 popl KLEN 3088 popl KEYP 3089 popl LEN 3090 popl IVP 3091#endif 3092 FRAME_END 3093 RET 3094 3095.Lxts_dec_1x: 3096 add $64, LEN 3097 jz .Lxts_dec_ret_iv 3098 3099.Lxts_dec_loop1: 3100 movdqu (INP), STATE 3101 3102 add $16, INP 3103 sub $16, LEN 3104 jl .Lxts_dec_cts1 3105 3106 pxor IV, STATE 3107 call _aesni_dec1 3108 pxor IV, STATE 3109 _aesni_gf128mul_x_ble() 3110 3111 test LEN, LEN 3112 jz .Lxts_dec_out 3113 3114 movdqu STATE, (OUTP) 3115 add $16, OUTP 3116 jmp .Lxts_dec_loop1 3117 3118.Lxts_dec_out: 3119 movdqu STATE, (OUTP) 3120 jmp .Lxts_dec_ret_iv 3121 3122.Lxts_dec_cts1: 3123 movdqa IV, STATE4 3124 _aesni_gf128mul_x_ble() 3125 3126 pxor IV, STATE 3127 call _aesni_dec1 3128 pxor IV, STATE 3129 3130#ifndef __x86_64__ 3131 lea .Lcts_permute_table, T1 3132#else 3133 lea .Lcts_permute_table(%rip), T1 3134#endif 3135 add LEN, INP /* rewind input pointer */ 3136 add $16, LEN /* # bytes in final block */ 3137 movups (INP), IN1 3138 3139 mov T1, IVP 3140 add $32, IVP 3141 add LEN, T1 3142 sub LEN, IVP 3143 add OUTP, LEN 3144 3145 movups (T1), %xmm4 3146 movaps STATE, IN2 3147 pshufb %xmm4, STATE 3148 movups STATE, (LEN) 3149 3150 movups (IVP), %xmm0 3151 pshufb %xmm0, IN1 3152 pblendvb IN2, IN1 3153 movaps IN1, STATE 3154 3155 pxor STATE4, STATE 3156 call _aesni_dec1 3157 pxor STATE4, STATE 3158 3159 movups STATE, (OUTP) 3160 jmp .Lxts_dec_ret 3161SYM_FUNC_END(aesni_xts_decrypt) 3162