/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values: for FP
 * use movaps (move aligned packed single), for integer use movdqa (move
 * double quad aligned).  It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released; however,
 * movaps is a byte shorter, so that is the one we'll use for now (the same
 * applies to the unaligned variant).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
80# more specifically, ALL_F should follow SHIFT_MASK, 81# and zero should follow ALL_F 82.section .rodata, "a", @progbits 83.align 16 84SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 85ALL_F: .octa 0xffffffffffffffffffffffffffffffff 86 .octa 0x00000000000000000000000000000000 87 88.text 89 90 91#define STACK_OFFSET 8*3 92 93#define AadHash 16*0 94#define AadLen 16*1 95#define InLen (16*1)+8 96#define PBlockEncKey 16*2 97#define OrigIV 16*3 98#define CurCount 16*4 99#define PBlockLen 16*5 100#define HashKey 16*6 // store HashKey <<1 mod poly here 101#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here 102#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here 103#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here 104#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64 105 // bits of HashKey <<1 mod poly here 106 //(for Karatsuba purposes) 107#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64 108 // bits of HashKey^2 <<1 mod poly here 109 // (for Karatsuba purposes) 110#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64 111 // bits of HashKey^3 <<1 mod poly here 112 // (for Karatsuba purposes) 113#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64 114 // bits of HashKey^4 <<1 mod poly here 115 // (for Karatsuba purposes) 116 117#define arg1 rdi 118#define arg2 rsi 119#define arg3 rdx 120#define arg4 rcx 121#define arg5 r8 122#define arg6 r9 123#define arg7 STACK_OFFSET+8(%rsp) 124#define arg8 STACK_OFFSET+16(%rsp) 125#define arg9 STACK_OFFSET+24(%rsp) 126#define arg10 STACK_OFFSET+32(%rsp) 127#define arg11 STACK_OFFSET+40(%rsp) 128#define keysize 2*15*16(%arg1) 129#endif 130 131 132#define STATE1 %xmm0 133#define STATE2 %xmm4 134#define STATE3 %xmm5 135#define STATE4 %xmm6 136#define STATE STATE1 137#define IN1 %xmm1 138#define IN2 %xmm7 139#define IN3 %xmm8 140#define IN4 %xmm9 141#define IN IN1 142#define KEY %xmm2 143#define IV %xmm3 144 145#define BSWAP_MASK %xmm10 146#define CTR %xmm11 147#define INC %xmm12 148 149#define GF128MUL_MASK %xmm10 150 151#ifdef __x86_64__ 152#define AREG %rax 153#define KEYP %rdi 154#define OUTP %rsi 155#define UKEYP OUTP 156#define INP %rdx 157#define LEN %rcx 158#define IVP %r8 159#define KLEN %r9d 160#define T1 %r10 161#define TKEYP T1 162#define T2 %r11 163#define TCTR_LOW T2 164#else 165#define AREG %eax 166#define KEYP %edi 167#define OUTP AREG 168#define UKEYP OUTP 169#define INP %edx 170#define LEN %esi 171#define IVP %ebp 172#define KLEN %ebx 173#define T1 %ecx 174#define TKEYP T1 175#endif 176 177.macro FUNC_SAVE 178 push %r12 179 push %r13 180 push %r14 181# 182# states of %xmm registers %xmm6:%xmm15 not saved 183# all %xmm registers are clobbered 184# 185.endm 186 187 188.macro FUNC_RESTORE 189 pop %r14 190 pop %r13 191 pop %r12 192.endm 193 194# Precompute hashkeys. 195# Input: Hash subkey. 196# Output: HashKeys stored in gcm_context_data. Only needs to be called 197# once per key. 198# clobbers r12, and tmp xmm registers. 
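#
# As a rough C-level model of what PRECOMPUTE stores (illustrative only;
# gf128_mul_x1() and gf128_mul() are stand-ins for the bit-reflected
# GF(2^128) operations done with PCLMULQDQ below, not real kernel helpers):
#
#	h = byteswap(load_16bytes(subkey));
#	h = gf128_mul_x1(h);			/* HashKey<<1 mod poly     */
#	ctx->hash_key[0] = h;			/* HashKey                 */
#	for (i = 1; i < 4; i++)			/* HashKey^2 .. HashKey^4  */
#		ctx->hash_key[i] = gf128_mul(ctx->hash_key[i - 1], h);
#	/* and, for each power, the XOR of its high and low 64-bit halves
#	 * is stored in the matching HashKey_*_k slot for Karatsuba */
#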
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	pshufb	\TMP2, \TMP3

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3
	psrlq	$63, \TMP2
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2
	psrldq	$8, \TMP1
	por	\TMP2, \TMP3

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd	TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP3, \TMP1
	pxor	\TMP3, \TMP1
	movdqu	\TMP1, HashKey_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	\TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	\TMP5, HashKey_3(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	\TMP5, HashKey_4(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_4_k(%arg2)
.endm

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov	\AADLEN, %r11
	mov	%r11, AadLen(%arg2)		# ctx_data.aad_length = aad_length
	xor	%r11d, %r11d
	mov	%r11, InLen(%arg2)		# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)		# ctx_data.partial_block_length = 0
	mov	%r11, PBlockEncKey(%arg2)	# ctx_data.partial_block_enc_key = 0
	mov	\Iv, %rax
	movdqu	(%rax), %xmm0
	movdqu	%xmm0, OrigIV(%arg2)		# ctx_data.orig_IV = iv

	movdqa	SHUF_MASK(%rip), %xmm2
	pshufb	%xmm2, %xmm0
	movdqu	%xmm0, CurCount(%arg2)		# ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu	HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
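# (Worked example: with plaintext_len = 100 bytes there are 6 full 16-byte
#  blocks and 4 trailing bytes; 6 mod 4 = 2, so the _initial_num_blocks_is_2
#  path below runs first, the remaining full blocks are handled 4 at a time,
#  and the last 4 bytes fall through to the <16-byte path at the end.)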
277# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK 278# Clobbers rax, r10-r13, and xmm0-xmm15 279.macro GCM_ENC_DEC operation 280 movdqu AadHash(%arg2), %xmm8 281 movdqu HashKey(%arg2), %xmm13 282 add %arg5, InLen(%arg2) 283 284 xor %r11d, %r11d # initialise the data pointer offset as zero 285 PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation 286 287 sub %r11, %arg5 # sub partial block data used 288 mov %arg5, %r13 # save the number of bytes 289 290 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) 291 mov %r13, %r12 292 # Encrypt/Decrypt first few blocks 293 294 and $(3<<4), %r12 295 jz _initial_num_blocks_is_0_\@ 296 cmp $(2<<4), %r12 297 jb _initial_num_blocks_is_1_\@ 298 je _initial_num_blocks_is_2_\@ 299_initial_num_blocks_is_3_\@: 300 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 301%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation 302 sub $48, %r13 303 jmp _initial_blocks_\@ 304_initial_num_blocks_is_2_\@: 305 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 306%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation 307 sub $32, %r13 308 jmp _initial_blocks_\@ 309_initial_num_blocks_is_1_\@: 310 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 311%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation 312 sub $16, %r13 313 jmp _initial_blocks_\@ 314_initial_num_blocks_is_0_\@: 315 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ 316%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation 317_initial_blocks_\@: 318 319 # Main loop - Encrypt/Decrypt remaining blocks 320 321 cmp $0, %r13 322 je _zero_cipher_left_\@ 323 sub $64, %r13 324 je _four_cipher_left_\@ 325_crypt_by_4_\@: 326 GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ 327 %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ 328 %xmm7, %xmm8, enc 329 add $64, %r11 330 sub $64, %r13 331 jne _crypt_by_4_\@ 332_four_cipher_left_\@: 333 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ 334%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 335_zero_cipher_left_\@: 336 movdqu %xmm8, AadHash(%arg2) 337 movdqu %xmm0, CurCount(%arg2) 338 339 mov %arg5, %r13 340 and $15, %r13 # %r13 = arg5 (mod 16) 341 je _multiple_of_16_bytes_\@ 342 343 mov %r13, PBlockLen(%arg2) 344 345 # Handle the last <16 Byte block separately 346 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn 347 movdqu %xmm0, CurCount(%arg2) 348 movdqa SHUF_MASK(%rip), %xmm10 349 pshufb %xmm10, %xmm0 350 351 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) 352 movdqu %xmm0, PBlockEncKey(%arg2) 353 354 cmp $16, %arg5 355 jge _large_enough_update_\@ 356 357 lea (%arg4,%r11,1), %r10 358 mov %r13, %r12 359 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 360 jmp _data_read_\@ 361 362_large_enough_update_\@: 363 sub $16, %r11 364 add %r13, %r11 365 366 # receive the last <16 Byte block 367 movdqu (%arg4, %r11, 1), %xmm1 368 369 sub %r13, %r11 370 add $16, %r11 371 372 lea SHIFT_MASK+16(%rip), %r12 373 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes 374 # (r13 is the number of bytes in plaintext mod 16) 375 sub %r13, %r12 376 # get the appropriate shuffle mask 377 movdqu (%r12), %xmm2 378 # shift right 16-r13 bytes 379 pshufb %xmm2, %xmm1 380 381_data_read_\@: 382 lea ALL_F+16(%rip), %r12 383 sub %r13, %r12 384 385.ifc \operation, dec 386 movdqa %xmm1, %xmm2 387.endif 388 pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn) 389 movdqu (%r12), %xmm1 390 # get the appropriate mask to 
#	mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0		# mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10 ,%xmm2

	pxor	%xmm2, %xmm8
.else
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10,%xmm0

	pxor	%xmm0, %xmm8
.endif

	movdqu	%xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	pshufb	%xmm10, %xmm0
.endif

	# Output %r13 bytes
	movq	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@
	mov	%rax, (%arg3 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	movq	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	mov	%al, (%arg3, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm

# GCM_COMPLETE finishes the GHASH update for any trailing partial block and
# computes the tag.
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13

	mov	PBlockLen(%arg2), %r12

	cmp	$0, %r12
	je	_partial_done\@

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	mov	InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*128)
	movq	%r12, %xmm1

	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm8

	movdqu	OrigIV(%arg2), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_\@:
	mov	\AUTHTAG, %r10		# %r10 = authTag
	mov	\AUTHTAGLEN, %r11	# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm

#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
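*
* As an illustrative sketch of the Karatsuba split used below, with
* A = A1:A0 and B = B1:B0 as 64-bit halves and "+" meaning XOR:
*
*	A*B = A1*B1*x^128 + A0*B0
*	      + ((A1+A0)*(B1+B0) + A1*B1 + A0*B0)*x^64
*
* so each 128x128-bit carry-less multiply costs three PCLMULQDQ products plus
* XORs, followed by the reduction modulo the GHASH polynomial.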
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	\GH, \TMP1
	pshufd	$78, \GH, \TMP2
	pshufd	$78, \HK, \TMP3
	pxor	\GH, \TMP2		# TMP2 = a1+a0
	pxor	\HK, \TMP3		# TMP3 = b1+b0
	pclmulqdq $0x11, \HK, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \HK, \GH	# GH = a0*b0
	pclmulqdq $0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	\GH, \TMP2
	pxor	\TMP1, \TMP2		# TMP2 = a1*b0 + a0*b1 (middle term)
	movdqa	\TMP2, \TMP3
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP3, \GH
	pxor	\TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	\GH, \TMP2
	movdqa	\GH, \TMP3
	movdqa	\GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	$31, \TMP2		# packed left shift <<31
	pslld	$30, \TMP3		# packed left shift <<30
	pslld	$25, \TMP4		# packed left shift <<25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP4, \TMP2
	movdqa	\TMP2, \TMP5
	psrldq	$4, \TMP5		# right shift TMP5 1 DW
	pslldq	$12, \TMP2		# left shift TMP2 3 DWs
	pxor	\TMP2, \GH

	# second phase of the reduction

	movdqa	\GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	\GH,\TMP3
	movdqa	\GH,\TMP4
	psrld	$1,\TMP2		# packed right shift >>1
	psrld	$2,\TMP3		# packed right shift >>2
	psrld	$7,\TMP4		# packed right shift >>7
	pxor	\TMP3,\TMP2		# xor the shifted versions
	pxor	\TMP4,\TMP2
	pxor	\TMP5, \TMP2
	pxor	\TMP2, \GH
	pxor	\TMP1, \GH		# result is in GH
.endm

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	_read_lt8_\@
	mov	(\DPTR), %rax
	movq	%rax, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax
_read_next_byte_\@:
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_\@
	movq	%rax, \XMM1
	pslldq	$8, \XMM1
	por	\XMM1, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	xor	%eax, %eax
_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	movq	%rax, \XMMDst
_done_read_partial_block_\@:
.endm

# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
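# Conceptually (an informal sketch, with ghash_mul() standing in for the
# GHASH_MUL macro above):
#	hash = 0;
#	for each 16-byte AAD block A_i, the last one zero-padded:
#		hash = ghash_mul(hash ^ byteswap(A_i), HashKey);
# The running hash is kept in TMP6 and stored to AadHash in the context.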
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	SHUF_MASK(%rip), %xmm14
	mov	\AAD, %r10		# %r10 = AAD
	mov	\AADLEN, %r11		# %r11 = aadLen
	pxor	\TMP7, \TMP7
	pxor	\TMP6, \TMP6

	cmp	$16, %r11
	jl	_get_AAD_rest\@
_get_AAD_blocks\@:
	movdqu	(%r10), \TMP7
	pshufb	%xmm14, \TMP7		# byte-reflect the AAD data
	pxor	\TMP7, \TMP6
	GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	$16, %r10
	sub	$16, %r11
	cmp	$16, %r11
	jge	_get_AAD_blocks\@

	movdqu	\TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	cmp	$0, %r11
	je	_get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	pshufb	%xmm14, \TMP7		# byte-reflect the AAD data
	pxor	\TMP6, \TMP7
	GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	\TMP7, \TMP6

_get_AAD_done\@:
	movdqu	\TMP6, AadHash(%arg2)
.endm

# PARTIAL_BLOCK: Handles encryption/decryption and hashing of partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov	PBlockLen(%arg2), %r13
	cmp	$0, %r13
	je	_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
	jmp	_data_read_\@

_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov	PBlockLen(%arg2), %r13

_data_read_\@:				# Finished reading in data

	movdqu	PBlockEncKey(%arg2), %xmm9
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (16-r13 is the number of bytes in plaintext mod 16)
	add	%r13, %r12
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	pshufb	%xmm2, %xmm9		# shift right r13 bytes

.ifc \operation, dec
	movdqa	%xmm1, %xmm3
	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	sub	$16, %r10
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	_no_extra_mask_1_\@
	sub	%r10, %r12
_no_extra_mask_1_\@:

	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	pand	%xmm1, %xmm3
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm3
	pshufb	%xmm2, %xmm3
	pxor	%xmm3, \AAD_HASH

	cmp	$0, %r10
	jl	_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax, %eax

	mov	%rax, PBlockLen(%arg2)
	jmp	_dec_done_\@
_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
_dec_done_\@:
	movdqu	\AAD_HASH, AadHash(%arg2)
.else
	pxor	%xmm1, %xmm9		# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# Set r10 to be the
amount of data left in CYPH_PLAIN_IN after filling 714 sub $16, %r10 715 # Determine if if partial block is not being filled and 716 # shift mask accordingly 717 jge _no_extra_mask_2_\@ 718 sub %r10, %r12 719_no_extra_mask_2_\@: 720 721 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 722 # get the appropriate mask to mask out bottom r13 bytes of xmm9 723 pand %xmm1, %xmm9 724 725 movdqa SHUF_MASK(%rip), %xmm1 726 pshufb %xmm1, %xmm9 727 pshufb %xmm2, %xmm9 728 pxor %xmm9, \AAD_HASH 729 730 cmp $0, %r10 731 jl _partial_incomplete_2_\@ 732 733 # GHASH computation for the last <16 Byte block 734 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 735 xor %eax, %eax 736 737 mov %rax, PBlockLen(%arg2) 738 jmp _encode_done_\@ 739_partial_incomplete_2_\@: 740 add \PLAIN_CYPH_LEN, PBlockLen(%arg2) 741_encode_done_\@: 742 movdqu \AAD_HASH, AadHash(%arg2) 743 744 movdqa SHUF_MASK(%rip), %xmm10 745 # shuffle xmm9 back to output as ciphertext 746 pshufb %xmm10, %xmm9 747 pshufb %xmm2, %xmm9 748.endif 749 # output encrypted Bytes 750 cmp $0, %r10 751 jl _partial_fill_\@ 752 mov %r13, %r12 753 mov $16, %r13 754 # Set r13 to be the number of bytes to write out 755 sub %r12, %r13 756 jmp _count_set_\@ 757_partial_fill_\@: 758 mov \PLAIN_CYPH_LEN, %r13 759_count_set_\@: 760 movdqa %xmm9, %xmm0 761 movq %xmm0, %rax 762 cmp $8, %r13 763 jle _less_than_8_bytes_left_\@ 764 765 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 766 add $8, \DATA_OFFSET 767 psrldq $8, %xmm0 768 movq %xmm0, %rax 769 sub $8, %r13 770_less_than_8_bytes_left_\@: 771 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 772 add $1, \DATA_OFFSET 773 shr $8, %rax 774 sub $1, %r13 775 jne _less_than_8_bytes_left_\@ 776_partial_block_done_\@: 777.endm # PARTIAL_BLOCK 778 779/* 780* if a = number of total plaintext bytes 781* b = floor(a/16) 782* num_initial_blocks = b mod 4 783* encrypt the initial num_initial_blocks blocks and apply ghash on 784* the ciphertext 785* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 786* are clobbered 787* arg1, %arg2, %arg3 are used as a pointer only, not modified 788*/ 789 790 791.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 792 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 793 MOVADQ SHUF_MASK(%rip), %xmm14 794 795 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 796 797 # start AES for num_initial_blocks blocks 798 799 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 800 801.if (\i == 5) || (\i == 6) || (\i == 7) 802 803 MOVADQ ONE(%RIP),\TMP1 804 MOVADQ 0(%arg1),\TMP2 805.irpc index, \i_seq 806 paddd \TMP1, \XMM0 # INCR Y0 807.ifc \operation, dec 808 movdqa \XMM0, %xmm\index 809.else 810 MOVADQ \XMM0, %xmm\index 811.endif 812 pshufb %xmm14, %xmm\index # perform a 16 byte swap 813 pxor \TMP2, %xmm\index 814.endr 815 lea 0x10(%arg1),%r10 816 mov keysize,%eax 817 shr $2,%eax # 128->4, 192->6, 256->8 818 add $5,%eax # 128->9, 192->11, 256->13 819 820aes_loop_initial_\@: 821 MOVADQ (%r10),\TMP1 822.irpc index, \i_seq 823 aesenc \TMP1, %xmm\index 824.endr 825 add $16,%r10 826 sub $1,%eax 827 jnz aes_loop_initial_\@ 828 829 MOVADQ (%r10), \TMP1 830.irpc index, \i_seq 831 aesenclast \TMP1, %xmm\index # Last Round 832.endr 833.irpc index, \i_seq 834 movdqu (%arg4 , %r11, 1), \TMP1 835 pxor \TMP1, %xmm\index 836 movdqu %xmm\index, (%arg3 , %r11, 1) 837 # write back plaintext/ciphertext for num_initial_blocks 838 add $16, %r11 839 840.ifc \operation, dec 841 movdqa \TMP1, %xmm\index 842.endif 843 pshufb %xmm14, %xmm\index 844 845 # prepare plaintext/ciphertext for GHASH computation 846.endr 
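
	# At this point each of the num_initial_blocks blocks has effectively
	# been processed as (illustrative C, with enc() standing in for the
	# aesenc/aesenclast key-schedule walk above):
	#
	#	ctr++;
	#	keystream = enc(round_keys, byteswap(ctr));
	#	out[i]    = in[i] ^ keystream;
	#	ghash_in  = byteswap(operation == dec ? in[i] : out[i]);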
847.endif 848 849 # apply GHASH on num_initial_blocks blocks 850 851.if \i == 5 852 pxor %xmm5, %xmm6 853 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 854 pxor %xmm6, %xmm7 855 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 856 pxor %xmm7, %xmm8 857 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 858.elseif \i == 6 859 pxor %xmm6, %xmm7 860 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 861 pxor %xmm7, %xmm8 862 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 863.elseif \i == 7 864 pxor %xmm7, %xmm8 865 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 866.endif 867 cmp $64, %r13 868 jl _initial_blocks_done\@ 869 # no need for precomputed values 870/* 871* 872* Precomputations for HashKey parallel with encryption of first 4 blocks. 873* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 874*/ 875 MOVADQ ONE(%RIP),\TMP1 876 paddd \TMP1, \XMM0 # INCR Y0 877 MOVADQ \XMM0, \XMM1 878 pshufb %xmm14, \XMM1 # perform a 16 byte swap 879 880 paddd \TMP1, \XMM0 # INCR Y0 881 MOVADQ \XMM0, \XMM2 882 pshufb %xmm14, \XMM2 # perform a 16 byte swap 883 884 paddd \TMP1, \XMM0 # INCR Y0 885 MOVADQ \XMM0, \XMM3 886 pshufb %xmm14, \XMM3 # perform a 16 byte swap 887 888 paddd \TMP1, \XMM0 # INCR Y0 889 MOVADQ \XMM0, \XMM4 890 pshufb %xmm14, \XMM4 # perform a 16 byte swap 891 892 MOVADQ 0(%arg1),\TMP1 893 pxor \TMP1, \XMM1 894 pxor \TMP1, \XMM2 895 pxor \TMP1, \XMM3 896 pxor \TMP1, \XMM4 897.irpc index, 1234 # do 4 rounds 898 movaps 0x10*\index(%arg1), \TMP1 899 aesenc \TMP1, \XMM1 900 aesenc \TMP1, \XMM2 901 aesenc \TMP1, \XMM3 902 aesenc \TMP1, \XMM4 903.endr 904.irpc index, 56789 # do next 5 rounds 905 movaps 0x10*\index(%arg1), \TMP1 906 aesenc \TMP1, \XMM1 907 aesenc \TMP1, \XMM2 908 aesenc \TMP1, \XMM3 909 aesenc \TMP1, \XMM4 910.endr 911 lea 0xa0(%arg1),%r10 912 mov keysize,%eax 913 shr $2,%eax # 128->4, 192->6, 256->8 914 sub $4,%eax # 128->0, 192->2, 256->4 915 jz aes_loop_pre_done\@ 916 917aes_loop_pre_\@: 918 MOVADQ (%r10),\TMP2 919.irpc index, 1234 920 aesenc \TMP2, %xmm\index 921.endr 922 add $16,%r10 923 sub $1,%eax 924 jnz aes_loop_pre_\@ 925 926aes_loop_pre_done\@: 927 MOVADQ (%r10), \TMP2 928 aesenclast \TMP2, \XMM1 929 aesenclast \TMP2, \XMM2 930 aesenclast \TMP2, \XMM3 931 aesenclast \TMP2, \XMM4 932 movdqu 16*0(%arg4 , %r11 , 1), \TMP1 933 pxor \TMP1, \XMM1 934.ifc \operation, dec 935 movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 936 movdqa \TMP1, \XMM1 937.endif 938 movdqu 16*1(%arg4 , %r11 , 1), \TMP1 939 pxor \TMP1, \XMM2 940.ifc \operation, dec 941 movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 942 movdqa \TMP1, \XMM2 943.endif 944 movdqu 16*2(%arg4 , %r11 , 1), \TMP1 945 pxor \TMP1, \XMM3 946.ifc \operation, dec 947 movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 948 movdqa \TMP1, \XMM3 949.endif 950 movdqu 16*3(%arg4 , %r11 , 1), \TMP1 951 pxor \TMP1, \XMM4 952.ifc \operation, dec 953 movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 954 movdqa \TMP1, \XMM4 955.else 956 movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 957 movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 958 movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 959 movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 960.endif 961 962 add $64, %r11 963 pshufb %xmm14, \XMM1 # perform a 16 byte swap 964 pxor \XMMDst, \XMM1 965# combine GHASHed value with the corresponding ciphertext 966 pshufb %xmm14, \XMM2 # perform a 16 byte swap 967 pshufb %xmm14, \XMM3 # perform a 16 byte swap 968 pshufb %xmm14, \XMM4 # perform a 16 byte swap 969 970_initial_blocks_done\@: 971 972.endm 973 974/* 975* encrypt 4 blocks at a time 976* ghash the 4 
previously encrypted ciphertext blocks 977* arg1, %arg3, %arg4 are used as pointers only, not modified 978* %r11 is the data offset value 979*/ 980.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ 981TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 982 983 movdqa \XMM1, \XMM5 984 movdqa \XMM2, \XMM6 985 movdqa \XMM3, \XMM7 986 movdqa \XMM4, \XMM8 987 988 movdqa SHUF_MASK(%rip), %xmm15 989 # multiply TMP5 * HashKey using karatsuba 990 991 movdqa \XMM5, \TMP4 992 pshufd $78, \XMM5, \TMP6 993 pxor \XMM5, \TMP6 994 paddd ONE(%rip), \XMM0 # INCR CNT 995 movdqu HashKey_4(%arg2), \TMP5 996 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 997 movdqa \XMM0, \XMM1 998 paddd ONE(%rip), \XMM0 # INCR CNT 999 movdqa \XMM0, \XMM2 1000 paddd ONE(%rip), \XMM0 # INCR CNT 1001 movdqa \XMM0, \XMM3 1002 paddd ONE(%rip), \XMM0 # INCR CNT 1003 movdqa \XMM0, \XMM4 1004 pshufb %xmm15, \XMM1 # perform a 16 byte swap 1005 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 1006 pshufb %xmm15, \XMM2 # perform a 16 byte swap 1007 pshufb %xmm15, \XMM3 # perform a 16 byte swap 1008 pshufb %xmm15, \XMM4 # perform a 16 byte swap 1009 1010 pxor (%arg1), \XMM1 1011 pxor (%arg1), \XMM2 1012 pxor (%arg1), \XMM3 1013 pxor (%arg1), \XMM4 1014 movdqu HashKey_4_k(%arg2), \TMP5 1015 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 1016 movaps 0x10(%arg1), \TMP1 1017 aesenc \TMP1, \XMM1 # Round 1 1018 aesenc \TMP1, \XMM2 1019 aesenc \TMP1, \XMM3 1020 aesenc \TMP1, \XMM4 1021 movaps 0x20(%arg1), \TMP1 1022 aesenc \TMP1, \XMM1 # Round 2 1023 aesenc \TMP1, \XMM2 1024 aesenc \TMP1, \XMM3 1025 aesenc \TMP1, \XMM4 1026 movdqa \XMM6, \TMP1 1027 pshufd $78, \XMM6, \TMP2 1028 pxor \XMM6, \TMP2 1029 movdqu HashKey_3(%arg2), \TMP5 1030 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 1031 movaps 0x30(%arg1), \TMP3 1032 aesenc \TMP3, \XMM1 # Round 3 1033 aesenc \TMP3, \XMM2 1034 aesenc \TMP3, \XMM3 1035 aesenc \TMP3, \XMM4 1036 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 1037 movaps 0x40(%arg1), \TMP3 1038 aesenc \TMP3, \XMM1 # Round 4 1039 aesenc \TMP3, \XMM2 1040 aesenc \TMP3, \XMM3 1041 aesenc \TMP3, \XMM4 1042 movdqu HashKey_3_k(%arg2), \TMP5 1043 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1044 movaps 0x50(%arg1), \TMP3 1045 aesenc \TMP3, \XMM1 # Round 5 1046 aesenc \TMP3, \XMM2 1047 aesenc \TMP3, \XMM3 1048 aesenc \TMP3, \XMM4 1049 pxor \TMP1, \TMP4 1050# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1051 pxor \XMM6, \XMM5 1052 pxor \TMP2, \TMP6 1053 movdqa \XMM7, \TMP1 1054 pshufd $78, \XMM7, \TMP2 1055 pxor \XMM7, \TMP2 1056 movdqu HashKey_2(%arg2), \TMP5 1057 1058 # Multiply TMP5 * HashKey using karatsuba 1059 1060 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1061 movaps 0x60(%arg1), \TMP3 1062 aesenc \TMP3, \XMM1 # Round 6 1063 aesenc \TMP3, \XMM2 1064 aesenc \TMP3, \XMM3 1065 aesenc \TMP3, \XMM4 1066 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 1067 movaps 0x70(%arg1), \TMP3 1068 aesenc \TMP3, \XMM1 # Round 7 1069 aesenc \TMP3, \XMM2 1070 aesenc \TMP3, \XMM3 1071 aesenc \TMP3, \XMM4 1072 movdqu HashKey_2_k(%arg2), \TMP5 1073 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1074 movaps 0x80(%arg1), \TMP3 1075 aesenc \TMP3, \XMM1 # Round 8 1076 aesenc \TMP3, \XMM2 1077 aesenc \TMP3, \XMM3 1078 aesenc \TMP3, \XMM4 1079 pxor \TMP1, \TMP4 1080# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1081 pxor \XMM7, \XMM5 1082 pxor \TMP2, \TMP6 1083 1084 # Multiply XMM8 * HashKey 1085 # XMM8 and TMP5 hold the values for the two operands 1086 1087 movdqa \XMM8, \TMP1 1088 
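	# (this last Karatsuba multiply, XMM8 * HashKey, is interleaved with
	#  AES round 9 and with the key-size dependent extra rounds handled by
	#  aes_loop_par_enc below)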
pshufd $78, \XMM8, \TMP2 1089 pxor \XMM8, \TMP2 1090 movdqu HashKey(%arg2), \TMP5 1091 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1092 movaps 0x90(%arg1), \TMP3 1093 aesenc \TMP3, \XMM1 # Round 9 1094 aesenc \TMP3, \XMM2 1095 aesenc \TMP3, \XMM3 1096 aesenc \TMP3, \XMM4 1097 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 1098 lea 0xa0(%arg1),%r10 1099 mov keysize,%eax 1100 shr $2,%eax # 128->4, 192->6, 256->8 1101 sub $4,%eax # 128->0, 192->2, 256->4 1102 jz aes_loop_par_enc_done\@ 1103 1104aes_loop_par_enc\@: 1105 MOVADQ (%r10),\TMP3 1106.irpc index, 1234 1107 aesenc \TMP3, %xmm\index 1108.endr 1109 add $16,%r10 1110 sub $1,%eax 1111 jnz aes_loop_par_enc\@ 1112 1113aes_loop_par_enc_done\@: 1114 MOVADQ (%r10), \TMP3 1115 aesenclast \TMP3, \XMM1 # Round 10 1116 aesenclast \TMP3, \XMM2 1117 aesenclast \TMP3, \XMM3 1118 aesenclast \TMP3, \XMM4 1119 movdqu HashKey_k(%arg2), \TMP5 1120 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1121 movdqu (%arg4,%r11,1), \TMP3 1122 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1123 movdqu 16(%arg4,%r11,1), \TMP3 1124 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1125 movdqu 32(%arg4,%r11,1), \TMP3 1126 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1127 movdqu 48(%arg4,%r11,1), \TMP3 1128 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1129 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer 1130 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer 1131 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer 1132 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer 1133 pshufb %xmm15, \XMM1 # perform a 16 byte swap 1134 pshufb %xmm15, \XMM2 # perform a 16 byte swap 1135 pshufb %xmm15, \XMM3 # perform a 16 byte swap 1136 pshufb %xmm15, \XMM4 # perform a 16 byte swap 1137 1138 pxor \TMP4, \TMP1 1139 pxor \XMM8, \XMM5 1140 pxor \TMP6, \TMP2 1141 pxor \TMP1, \TMP2 1142 pxor \XMM5, \TMP2 1143 movdqa \TMP2, \TMP3 1144 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1145 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1146 pxor \TMP3, \XMM5 1147 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1148 1149 # first phase of reduction 1150 1151 movdqa \XMM5, \TMP2 1152 movdqa \XMM5, \TMP3 1153 movdqa \XMM5, \TMP4 1154# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1155 pslld $31, \TMP2 # packed right shift << 31 1156 pslld $30, \TMP3 # packed right shift << 30 1157 pslld $25, \TMP4 # packed right shift << 25 1158 pxor \TMP3, \TMP2 # xor the shifted versions 1159 pxor \TMP4, \TMP2 1160 movdqa \TMP2, \TMP5 1161 psrldq $4, \TMP5 # right shift T5 1 DW 1162 pslldq $12, \TMP2 # left shift T2 3 DWs 1163 pxor \TMP2, \XMM5 1164 1165 # second phase of reduction 1166 1167 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1168 movdqa \XMM5,\TMP3 1169 movdqa \XMM5,\TMP4 1170 psrld $1, \TMP2 # packed left shift >>1 1171 psrld $2, \TMP3 # packed left shift >>2 1172 psrld $7, \TMP4 # packed left shift >>7 1173 pxor \TMP3,\TMP2 # xor the shifted versions 1174 pxor \TMP4,\TMP2 1175 pxor \TMP5, \TMP2 1176 pxor \TMP2, \XMM5 1177 pxor \TMP1, \XMM5 # result is in TMP1 1178 1179 pxor \XMM5, \XMM1 1180.endm 1181 1182/* 1183* decrypt 4 blocks at a time 1184* ghash the 4 previously decrypted ciphertext blocks 1185* arg1, %arg3, %arg4 are used as pointers only, not modified 1186* %r11 is the data offset value 1187*/ 1188.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ 1189TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 1190 1191 movdqa \XMM1, \XMM5 1192 movdqa \XMM2, \XMM6 1193 
movdqa \XMM3, \XMM7 1194 movdqa \XMM4, \XMM8 1195 1196 movdqa SHUF_MASK(%rip), %xmm15 1197 # multiply TMP5 * HashKey using karatsuba 1198 1199 movdqa \XMM5, \TMP4 1200 pshufd $78, \XMM5, \TMP6 1201 pxor \XMM5, \TMP6 1202 paddd ONE(%rip), \XMM0 # INCR CNT 1203 movdqu HashKey_4(%arg2), \TMP5 1204 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 1205 movdqa \XMM0, \XMM1 1206 paddd ONE(%rip), \XMM0 # INCR CNT 1207 movdqa \XMM0, \XMM2 1208 paddd ONE(%rip), \XMM0 # INCR CNT 1209 movdqa \XMM0, \XMM3 1210 paddd ONE(%rip), \XMM0 # INCR CNT 1211 movdqa \XMM0, \XMM4 1212 pshufb %xmm15, \XMM1 # perform a 16 byte swap 1213 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 1214 pshufb %xmm15, \XMM2 # perform a 16 byte swap 1215 pshufb %xmm15, \XMM3 # perform a 16 byte swap 1216 pshufb %xmm15, \XMM4 # perform a 16 byte swap 1217 1218 pxor (%arg1), \XMM1 1219 pxor (%arg1), \XMM2 1220 pxor (%arg1), \XMM3 1221 pxor (%arg1), \XMM4 1222 movdqu HashKey_4_k(%arg2), \TMP5 1223 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 1224 movaps 0x10(%arg1), \TMP1 1225 aesenc \TMP1, \XMM1 # Round 1 1226 aesenc \TMP1, \XMM2 1227 aesenc \TMP1, \XMM3 1228 aesenc \TMP1, \XMM4 1229 movaps 0x20(%arg1), \TMP1 1230 aesenc \TMP1, \XMM1 # Round 2 1231 aesenc \TMP1, \XMM2 1232 aesenc \TMP1, \XMM3 1233 aesenc \TMP1, \XMM4 1234 movdqa \XMM6, \TMP1 1235 pshufd $78, \XMM6, \TMP2 1236 pxor \XMM6, \TMP2 1237 movdqu HashKey_3(%arg2), \TMP5 1238 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 1239 movaps 0x30(%arg1), \TMP3 1240 aesenc \TMP3, \XMM1 # Round 3 1241 aesenc \TMP3, \XMM2 1242 aesenc \TMP3, \XMM3 1243 aesenc \TMP3, \XMM4 1244 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 1245 movaps 0x40(%arg1), \TMP3 1246 aesenc \TMP3, \XMM1 # Round 4 1247 aesenc \TMP3, \XMM2 1248 aesenc \TMP3, \XMM3 1249 aesenc \TMP3, \XMM4 1250 movdqu HashKey_3_k(%arg2), \TMP5 1251 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1252 movaps 0x50(%arg1), \TMP3 1253 aesenc \TMP3, \XMM1 # Round 5 1254 aesenc \TMP3, \XMM2 1255 aesenc \TMP3, \XMM3 1256 aesenc \TMP3, \XMM4 1257 pxor \TMP1, \TMP4 1258# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1259 pxor \XMM6, \XMM5 1260 pxor \TMP2, \TMP6 1261 movdqa \XMM7, \TMP1 1262 pshufd $78, \XMM7, \TMP2 1263 pxor \XMM7, \TMP2 1264 movdqu HashKey_2(%arg2), \TMP5 1265 1266 # Multiply TMP5 * HashKey using karatsuba 1267 1268 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1269 movaps 0x60(%arg1), \TMP3 1270 aesenc \TMP3, \XMM1 # Round 6 1271 aesenc \TMP3, \XMM2 1272 aesenc \TMP3, \XMM3 1273 aesenc \TMP3, \XMM4 1274 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 1275 movaps 0x70(%arg1), \TMP3 1276 aesenc \TMP3, \XMM1 # Round 7 1277 aesenc \TMP3, \XMM2 1278 aesenc \TMP3, \XMM3 1279 aesenc \TMP3, \XMM4 1280 movdqu HashKey_2_k(%arg2), \TMP5 1281 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1282 movaps 0x80(%arg1), \TMP3 1283 aesenc \TMP3, \XMM1 # Round 8 1284 aesenc \TMP3, \XMM2 1285 aesenc \TMP3, \XMM3 1286 aesenc \TMP3, \XMM4 1287 pxor \TMP1, \TMP4 1288# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1289 pxor \XMM7, \XMM5 1290 pxor \TMP2, \TMP6 1291 1292 # Multiply XMM8 * HashKey 1293 # XMM8 and TMP5 hold the values for the two operands 1294 1295 movdqa \XMM8, \TMP1 1296 pshufd $78, \XMM8, \TMP2 1297 pxor \XMM8, \TMP2 1298 movdqu HashKey(%arg2), \TMP5 1299 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1300 movaps 0x90(%arg1), \TMP3 1301 aesenc \TMP3, \XMM1 # Round 9 1302 aesenc \TMP3, \XMM2 1303 aesenc \TMP3, \XMM3 1304 aesenc \TMP3, \XMM4 1305 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = 
a0*b0 1306 lea 0xa0(%arg1),%r10 1307 mov keysize,%eax 1308 shr $2,%eax # 128->4, 192->6, 256->8 1309 sub $4,%eax # 128->0, 192->2, 256->4 1310 jz aes_loop_par_dec_done\@ 1311 1312aes_loop_par_dec\@: 1313 MOVADQ (%r10),\TMP3 1314.irpc index, 1234 1315 aesenc \TMP3, %xmm\index 1316.endr 1317 add $16,%r10 1318 sub $1,%eax 1319 jnz aes_loop_par_dec\@ 1320 1321aes_loop_par_dec_done\@: 1322 MOVADQ (%r10), \TMP3 1323 aesenclast \TMP3, \XMM1 # last round 1324 aesenclast \TMP3, \XMM2 1325 aesenclast \TMP3, \XMM3 1326 aesenclast \TMP3, \XMM4 1327 movdqu HashKey_k(%arg2), \TMP5 1328 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1329 movdqu (%arg4,%r11,1), \TMP3 1330 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1331 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer 1332 movdqa \TMP3, \XMM1 1333 movdqu 16(%arg4,%r11,1), \TMP3 1334 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1335 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer 1336 movdqa \TMP3, \XMM2 1337 movdqu 32(%arg4,%r11,1), \TMP3 1338 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1339 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer 1340 movdqa \TMP3, \XMM3 1341 movdqu 48(%arg4,%r11,1), \TMP3 1342 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1343 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer 1344 movdqa \TMP3, \XMM4 1345 pshufb %xmm15, \XMM1 # perform a 16 byte swap 1346 pshufb %xmm15, \XMM2 # perform a 16 byte swap 1347 pshufb %xmm15, \XMM3 # perform a 16 byte swap 1348 pshufb %xmm15, \XMM4 # perform a 16 byte swap 1349 1350 pxor \TMP4, \TMP1 1351 pxor \XMM8, \XMM5 1352 pxor \TMP6, \TMP2 1353 pxor \TMP1, \TMP2 1354 pxor \XMM5, \TMP2 1355 movdqa \TMP2, \TMP3 1356 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1357 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1358 pxor \TMP3, \XMM5 1359 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1360 1361 # first phase of reduction 1362 1363 movdqa \XMM5, \TMP2 1364 movdqa \XMM5, \TMP3 1365 movdqa \XMM5, \TMP4 1366# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1367 pslld $31, \TMP2 # packed right shift << 31 1368 pslld $30, \TMP3 # packed right shift << 30 1369 pslld $25, \TMP4 # packed right shift << 25 1370 pxor \TMP3, \TMP2 # xor the shifted versions 1371 pxor \TMP4, \TMP2 1372 movdqa \TMP2, \TMP5 1373 psrldq $4, \TMP5 # right shift T5 1 DW 1374 pslldq $12, \TMP2 # left shift T2 3 DWs 1375 pxor \TMP2, \XMM5 1376 1377 # second phase of reduction 1378 1379 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1380 movdqa \XMM5,\TMP3 1381 movdqa \XMM5,\TMP4 1382 psrld $1, \TMP2 # packed left shift >>1 1383 psrld $2, \TMP3 # packed left shift >>2 1384 psrld $7, \TMP4 # packed left shift >>7 1385 pxor \TMP3,\TMP2 # xor the shifted versions 1386 pxor \TMP4,\TMP2 1387 pxor \TMP5, \TMP2 1388 pxor \TMP2, \XMM5 1389 pxor \TMP1, \XMM5 # result is in TMP1 1390 1391 pxor \XMM5, \XMM1 1392.endm 1393 1394/* GHASH the last 4 ciphertext blocks. 
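*
* For reference, with X1..X4 denoting the byte-reflected blocks saved by the
* 4-way loop above (X1 already has the running hash folded in), this computes
*
*	XMMDst = X1*HashKey^4 + X2*HashKey^3 + X3*HashKey^2 + X4*HashKey
*
* in GF(2^128), i.e. four GHASH steps in aggregated (per-key-power) form.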
*/ 1395.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ 1396TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst 1397 1398 # Multiply TMP6 * HashKey (using Karatsuba) 1399 1400 movdqa \XMM1, \TMP6 1401 pshufd $78, \XMM1, \TMP2 1402 pxor \XMM1, \TMP2 1403 movdqu HashKey_4(%arg2), \TMP5 1404 pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1 1405 pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0 1406 movdqu HashKey_4_k(%arg2), \TMP4 1407 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1408 movdqa \XMM1, \XMMDst 1409 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 1410 1411 # Multiply TMP1 * HashKey (using Karatsuba) 1412 1413 movdqa \XMM2, \TMP1 1414 pshufd $78, \XMM2, \TMP2 1415 pxor \XMM2, \TMP2 1416 movdqu HashKey_3(%arg2), \TMP5 1417 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1418 pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0 1419 movdqu HashKey_3_k(%arg2), \TMP4 1420 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1421 pxor \TMP1, \TMP6 1422 pxor \XMM2, \XMMDst 1423 pxor \TMP2, \XMM1 1424# results accumulated in TMP6, XMMDst, XMM1 1425 1426 # Multiply TMP1 * HashKey (using Karatsuba) 1427 1428 movdqa \XMM3, \TMP1 1429 pshufd $78, \XMM3, \TMP2 1430 pxor \XMM3, \TMP2 1431 movdqu HashKey_2(%arg2), \TMP5 1432 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1433 pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0 1434 movdqu HashKey_2_k(%arg2), \TMP4 1435 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1436 pxor \TMP1, \TMP6 1437 pxor \XMM3, \XMMDst 1438 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 1439 1440 # Multiply TMP1 * HashKey (using Karatsuba) 1441 movdqa \XMM4, \TMP1 1442 pshufd $78, \XMM4, \TMP2 1443 pxor \XMM4, \TMP2 1444 movdqu HashKey(%arg2), \TMP5 1445 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1446 pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0 1447 movdqu HashKey_k(%arg2), \TMP4 1448 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1449 pxor \TMP1, \TMP6 1450 pxor \XMM4, \XMMDst 1451 pxor \XMM1, \TMP2 1452 pxor \TMP6, \TMP2 1453 pxor \XMMDst, \TMP2 1454 # middle section of the temp results combined as in karatsuba algorithm 1455 movdqa \TMP2, \TMP4 1456 pslldq $8, \TMP4 # left shift TMP4 2 DWs 1457 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1458 pxor \TMP4, \XMMDst 1459 pxor \TMP2, \TMP6 1460# TMP6:XMMDst holds the result of the accumulated carry-less multiplications 1461 # first phase of the reduction 1462 movdqa \XMMDst, \TMP2 1463 movdqa \XMMDst, \TMP3 1464 movdqa \XMMDst, \TMP4 1465# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently 1466 pslld $31, \TMP2 # packed right shifting << 31 1467 pslld $30, \TMP3 # packed right shifting << 30 1468 pslld $25, \TMP4 # packed right shifting << 25 1469 pxor \TMP3, \TMP2 # xor the shifted versions 1470 pxor \TMP4, \TMP2 1471 movdqa \TMP2, \TMP7 1472 psrldq $4, \TMP7 # right shift TMP7 1 DW 1473 pslldq $12, \TMP2 # left shift TMP2 3 DWs 1474 pxor \TMP2, \XMMDst 1475 1476 # second phase of the reduction 1477 movdqa \XMMDst, \TMP2 1478 # make 3 copies of XMMDst for doing 3 shift operations 1479 movdqa \XMMDst, \TMP3 1480 movdqa \XMMDst, \TMP4 1481 psrld $1, \TMP2 # packed left shift >> 1 1482 psrld $2, \TMP3 # packed left shift >> 2 1483 psrld $7, \TMP4 # packed left shift >> 7 1484 pxor \TMP3, \TMP2 # xor the shifted versions 1485 pxor \TMP4, \TMP2 1486 pxor \TMP7, \TMP2 1487 pxor \TMP2, \XMMDst 1488 pxor \TMP6, \XMMDst # reduced result is in XMMDst 1489.endm 1490 1491 1492/* Encryption of a single block 1493* uses eax & r10 1494*/ 1495 1496.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 1497 1498 pxor (%arg1), 
\XMM0 1499 mov keysize,%eax 1500 shr $2,%eax # 128->4, 192->6, 256->8 1501 add $5,%eax # 128->9, 192->11, 256->13 1502 lea 16(%arg1), %r10 # get first expanded key address 1503 1504_esb_loop_\@: 1505 MOVADQ (%r10),\TMP1 1506 aesenc \TMP1,\XMM0 1507 add $16,%r10 1508 sub $1,%eax 1509 jnz _esb_loop_\@ 1510 1511 MOVADQ (%r10),\TMP1 1512 aesenclast \TMP1,\XMM0 1513.endm 1514/***************************************************************************** 1515* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1516* struct gcm_context_data *data 1517* // Context data 1518* u8 *out, // Plaintext output. Encrypt in-place is allowed. 1519* const u8 *in, // Ciphertext input 1520* u64 plaintext_len, // Length of data in bytes for decryption. 1521* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1522* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1523* // concatenated with 0x00000001. 16-byte aligned pointer. 1524* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1525* const u8 *aad, // Additional Authentication Data (AAD) 1526* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1527* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the 1528* // given authentication tag and only return the plaintext if they match. 1529* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 1530* // (most likely), 12 or 8. 1531* 1532* Assumptions: 1533* 1534* keys: 1535* keys are pre-expanded and aligned to 16 bytes. we are using the first 1536* set of 11 keys in the data structure void *aes_ctx 1537* 1538* iv: 1539* 0 1 2 3 1540* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1541* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1542* | Salt (From the SA) | 1543* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1544* | Initialization Vector | 1545* | (This is the sequence number from IPSec header) | 1546* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1547* | 0x1 | 1548* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1549* 1550* 1551* 1552* AAD: 1553* AAD padded to 128 bits with 0 1554* for example, assume AAD is a u32 vector 1555* 1556* if AAD is 8 bytes: 1557* AAD[3] = {A0, A1}; 1558* padded AAD in xmm register = {A1 A0 0 0} 1559* 1560* 0 1 2 3 1561* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1562* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1563* | SPI (A1) | 1564* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1565* | 32-bit Sequence Number (A0) | 1566* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1567* | 0x0 | 1568* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1569* 1570* AAD Format with 32-bit Sequence Number 1571* 1572* if AAD is 12 bytes: 1573* AAD[3] = {A0, A1, A2}; 1574* padded AAD in xmm register = {A2 A1 A0 0} 1575* 1576* 0 1 2 3 1577* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1578* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1579* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1580* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1581* | SPI (A2) | 1582* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1583* | 64-bit Extended Sequence Number {A1,A0} | 1584* | | 1585* 
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1586* | 0x0 | 1587* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1588* 1589* AAD Format with 64-bit Extended Sequence Number 1590* 1591* poly = x^128 + x^127 + x^126 + x^121 + 1 1592* 1593*****************************************************************************/ 1594SYM_FUNC_START(aesni_gcm_dec) 1595 FUNC_SAVE 1596 1597 GCM_INIT %arg6, arg7, arg8, arg9 1598 GCM_ENC_DEC dec 1599 GCM_COMPLETE arg10, arg11 1600 FUNC_RESTORE 1601 ret 1602SYM_FUNC_END(aesni_gcm_dec) 1603 1604 1605/***************************************************************************** 1606* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1607* struct gcm_context_data *data 1608* // Context data 1609* u8 *out, // Ciphertext output. Encrypt in-place is allowed. 1610* const u8 *in, // Plaintext input 1611* u64 plaintext_len, // Length of data in bytes for encryption. 1612* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1613* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1614* // concatenated with 0x00000001. 16-byte aligned pointer. 1615* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1616* const u8 *aad, // Additional Authentication Data (AAD) 1617* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1618* u8 *auth_tag, // Authenticated Tag output. 1619* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 1620* // 12 or 8. 1621* 1622* Assumptions: 1623* 1624* keys: 1625* keys are pre-expanded and aligned to 16 bytes. we are using the 1626* first set of 11 keys in the data structure void *aes_ctx 1627* 1628* 1629* iv: 1630* 0 1 2 3 1631* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1632* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1633* | Salt (From the SA) | 1634* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1635* | Initialization Vector | 1636* | (This is the sequence number from IPSec header) | 1637* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1638* | 0x1 | 1639* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1640* 1641* 1642* 1643* AAD: 1644* AAD padded to 128 bits with 0 1645* for example, assume AAD is a u32 vector 1646* 1647* if AAD is 8 bytes: 1648* AAD[3] = {A0, A1}; 1649* padded AAD in xmm register = {A1 A0 0 0} 1650* 1651* 0 1 2 3 1652* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1653* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1654* | SPI (A1) | 1655* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1656* | 32-bit Sequence Number (A0) | 1657* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1658* | 0x0 | 1659* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1660* 1661* AAD Format with 32-bit Sequence Number 1662* 1663* if AAD is 12 bytes: 1664* AAD[3] = {A0, A1, A2}; 1665* padded AAD in xmm register = {A2 A1 A0 0} 1666* 1667* 0 1 2 3 1668* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1669* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1670* | SPI (A2) | 1671* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1672* | 64-bit Extended Sequence Number {A1,A0} | 1673* | | 1674* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1675* | 0x0 | 1676* 
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
SYM_FUNC_START(aesni_gcm_enc)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC enc

	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_enc)

/*****************************************************************************
* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                          // context data
*                     u8 *iv,              // Pre-counter block j0: 4 byte salt (from Security Association)
*                                          // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                          // concatenated with 0x00000001. 16-byte aligned pointer.
*                     u8 *hash_subkey,     // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                     const u8 *aad,       // Additional Authentication Data (AAD)
*                     u64 aad_len)         // Length of AAD in bytes.
*/
SYM_FUNC_START(aesni_gcm_init)
	FUNC_SAVE
	GCM_INIT %arg3, %arg4,%arg5, %arg6
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_init)

/*****************************************************************************
* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                          // context data
*                     u8 *out,             // Ciphertext output. Encrypt in-place is allowed.
*                     const u8 *in,        // Plaintext input
*                     u64 plaintext_len);  // Length of data in bytes for encryption.
*/
SYM_FUNC_START(aesni_gcm_enc_update)
	FUNC_SAVE
	GCM_ENC_DEC enc
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_enc_update)

/*****************************************************************************
* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                          // context data
*                     u8 *out,             // Plaintext output. Decrypt in-place is allowed.
*                     const u8 *in,        // Ciphertext input
*                     u64 plaintext_len);  // Length of data in bytes for decryption.
*/
SYM_FUNC_START(aesni_gcm_dec_update)
	FUNC_SAVE
	GCM_ENC_DEC dec
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_dec_update)

/*****************************************************************************
* void aesni_gcm_finalize(void *aes_ctx,   // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                          // context data
*                     u8 *auth_tag,        // Authenticated Tag output.
*                     u64 auth_tag_len);   // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                          // 12 or 8.
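*
* A minimal call sequence from C, for illustration only (the real glue code
* in aesni-intel_glue.c also walks scatterlists and handles errors):
*
*	struct gcm_context_data data;
*	u8 tag[16];
*
*	aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
*	aesni_gcm_enc_update(aes_ctx, &data, dst, src, src_len);
*	aesni_gcm_finalize(aes_ctx, &data, tag, sizeof(tag));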
1748*/ 1749SYM_FUNC_START(aesni_gcm_finalize) 1750 FUNC_SAVE 1751 GCM_COMPLETE %arg3 %arg4 1752 FUNC_RESTORE 1753 ret 1754SYM_FUNC_END(aesni_gcm_finalize) 1755 1756#endif 1757 1758 1759SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128) 1760SYM_FUNC_START_LOCAL(_key_expansion_256a) 1761 pshufd $0b11111111, %xmm1, %xmm1 1762 shufps $0b00010000, %xmm0, %xmm4 1763 pxor %xmm4, %xmm0 1764 shufps $0b10001100, %xmm0, %xmm4 1765 pxor %xmm4, %xmm0 1766 pxor %xmm1, %xmm0 1767 movaps %xmm0, (TKEYP) 1768 add $0x10, TKEYP 1769 ret 1770SYM_FUNC_END(_key_expansion_256a) 1771SYM_FUNC_END_ALIAS(_key_expansion_128) 1772 1773SYM_FUNC_START_LOCAL(_key_expansion_192a) 1774 pshufd $0b01010101, %xmm1, %xmm1 1775 shufps $0b00010000, %xmm0, %xmm4 1776 pxor %xmm4, %xmm0 1777 shufps $0b10001100, %xmm0, %xmm4 1778 pxor %xmm4, %xmm0 1779 pxor %xmm1, %xmm0 1780 1781 movaps %xmm2, %xmm5 1782 movaps %xmm2, %xmm6 1783 pslldq $4, %xmm5 1784 pshufd $0b11111111, %xmm0, %xmm3 1785 pxor %xmm3, %xmm2 1786 pxor %xmm5, %xmm2 1787 1788 movaps %xmm0, %xmm1 1789 shufps $0b01000100, %xmm0, %xmm6 1790 movaps %xmm6, (TKEYP) 1791 shufps $0b01001110, %xmm2, %xmm1 1792 movaps %xmm1, 0x10(TKEYP) 1793 add $0x20, TKEYP 1794 ret 1795SYM_FUNC_END(_key_expansion_192a) 1796 1797SYM_FUNC_START_LOCAL(_key_expansion_192b) 1798 pshufd $0b01010101, %xmm1, %xmm1 1799 shufps $0b00010000, %xmm0, %xmm4 1800 pxor %xmm4, %xmm0 1801 shufps $0b10001100, %xmm0, %xmm4 1802 pxor %xmm4, %xmm0 1803 pxor %xmm1, %xmm0 1804 1805 movaps %xmm2, %xmm5 1806 pslldq $4, %xmm5 1807 pshufd $0b11111111, %xmm0, %xmm3 1808 pxor %xmm3, %xmm2 1809 pxor %xmm5, %xmm2 1810 1811 movaps %xmm0, (TKEYP) 1812 add $0x10, TKEYP 1813 ret 1814SYM_FUNC_END(_key_expansion_192b) 1815 1816SYM_FUNC_START_LOCAL(_key_expansion_256b) 1817 pshufd $0b10101010, %xmm1, %xmm1 1818 shufps $0b00010000, %xmm2, %xmm4 1819 pxor %xmm4, %xmm2 1820 shufps $0b10001100, %xmm2, %xmm4 1821 pxor %xmm4, %xmm2 1822 pxor %xmm1, %xmm2 1823 movaps %xmm2, (TKEYP) 1824 add $0x10, TKEYP 1825 ret 1826SYM_FUNC_END(_key_expansion_256b) 1827 1828/* 1829 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 1830 * unsigned int key_len) 1831 */ 1832SYM_FUNC_START(aesni_set_key) 1833 FRAME_BEGIN 1834#ifndef __x86_64__ 1835 pushl KEYP 1836 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx 1837 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key 1838 movl (FRAME_OFFSET+16)(%esp), %edx # key_len 1839#endif 1840 movups (UKEYP), %xmm0 # user key (first 16 bytes) 1841 movaps %xmm0, (KEYP) 1842 lea 0x10(KEYP), TKEYP # key addr 1843 movl %edx, 480(KEYP) 1844 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1845 cmp $24, %dl 1846 jb .Lenc_key128 1847 je .Lenc_key192 1848 movups 0x10(UKEYP), %xmm2 # other user key 1849 movaps %xmm2, (TKEYP) 1850 add $0x10, TKEYP 1851 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 1852 call _key_expansion_256a 1853 aeskeygenassist $0x1, %xmm0, %xmm1 1854 call _key_expansion_256b 1855 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 1856 call _key_expansion_256a 1857 aeskeygenassist $0x2, %xmm0, %xmm1 1858 call _key_expansion_256b 1859 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 1860 call _key_expansion_256a 1861 aeskeygenassist $0x4, %xmm0, %xmm1 1862 call _key_expansion_256b 1863 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 1864 call _key_expansion_256a 1865 aeskeygenassist $0x8, %xmm0, %xmm1 1866 call _key_expansion_256b 1867 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 1868 call _key_expansion_256a 1869 aeskeygenassist $0x10, %xmm0, %xmm1 1870 call _key_expansion_256b 1871 aeskeygenassist $0x20, %xmm2, %xmm1 # 
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_256a
	aeskeygenassist $0x1, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_256a
	aeskeygenassist $0x2, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_256a
	aeskeygenassist $0x4, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_256a
	aeskeygenassist $0x8, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_256a
	aeskeygenassist $0x20, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_192a
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_192b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_192a
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_192b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_192a
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_192b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_192a
	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
	call _key_expansion_128
	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
	call _key_expansion_128
	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
	call _key_expansion_128
	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
	call _key_expansion_128
	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
	call _key_expansion_128
	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
	call _key_expansion_128
	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
	call _key_expansion_128
	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
	call _key_expansion_128
	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
	call _key_expansion_128
	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	aesimc %xmm0, %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_set_key)

/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_enc)

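# The single-block and four-block helpers below bias TKEYP past the start of
# the key schedule (by 0x30, plus another 0x20 for 192-bit and 0x40 for
# 256-bit keys) so that the common .L*128 tail can address its ten round keys
# at the fixed offsets -0x20 through 0x70; the larger key sizes simply run
# their extra leading rounds first.
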
/*
 * _aesni_enc1:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps (TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE
	ret
SYM_FUNC_END(_aesni_enc1)

/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps (TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE1		# last round
	aesenclast KEY, STATE2
	aesenclast KEY, STATE3
	aesenclast KEY, STATE4
	ret
SYM_FUNC_END(_aesni_enc4)

/*
 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
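	# key_dec, the decryption round-key schedule, starts at offset 240
	# in struct crypto_aes_ctx; point KEYP at it for _aesni_dec1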
	add $240, KEYP
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_dec)

/*
 * _aesni_dec1:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps (TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE
	ret
SYM_FUNC_END(_aesni_dec1)

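# Like _aesni_enc4 above, _aesni_dec4 interleaves four independent blocks so
# that the latency of each aesdec round is hidden behind work on the other
# blocks.
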
/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps (TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	ret
SYM_FUNC_END(_aesni_dec4)

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_ecb_enc)

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_ecb_dec)

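# CBC encryption is inherently serial: each plaintext block is XORed with the
# previous ciphertext block before being encrypted, so only the one-block
# helper can be used here.  CBC decryption (aesni_cbc_dec below) has no such
# dependency and runs four blocks at a time through _aesni_dec4.
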
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cbc_enc)

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4
	movaps IN2, IV
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cbc_dec)

#ifdef __x86_64__
.pushsection .rodata
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection

/*
 * _aesni_inc_init:	internal ABI
 *	set up the registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC
	movq CTR, TCTR_LOW
	ret
SYM_FUNC_END(_aesni_inc_init)
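
# The counter is kept byte-swapped (little endian) in CTR so it can be
# incremented with a 64-bit paddq; TCTR_LOW mirrors the low qword in a GPR so
# that a wrap of the low 64 bits can be detected and the carry propagated into
# the high qword in _aesni_inc below.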

/*
 * _aesni_inc:	internal ABI
 *	Increment IV by 1; IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	incremented by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV
	ret
SYM_FUNC_END(_aesni_inc)

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	ret
SYM_FUNC_END(aesni_ctr_enc)

/*
 * _aesni_gf128mul_x_ble:	internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;

/*
 * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			 const u8 *src, bool enc, le128 *iv)
 */
SYM_FUNC_START(aesni_xts_crypt8)
	FRAME_BEGIN
	cmpb $0, %cl
	movl $0, %ecx
	movl $240, %r10d
	leaq _aesni_enc4, %r11
	leaq _aesni_dec4, %rax
	cmovel %r10d, %ecx
	cmoveq %rax, %r11

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN
	addq %rcx, KEYP

	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC r11

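	# Each block's tweak was XORed into the input and also stashed in the
	# destination buffer above, so the post-whitening XOR can simply
	# reload it from (OUTP).  Do that for blocks 0-3 while setting up the
	# tweaks and inputs for blocks 4-7.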
	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)

	CALL_NOSPEC r11

	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
SYM_FUNC_END(aesni_xts_crypt8)

#endif