/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can be done for either FP or integer values: for FP
 * use movaps (move aligned packed single), for integer use movdqa (move
 * double quad aligned). It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released. However,
 * movaps is a byte shorter, so that is the one we'll use for now (the same
 * applies to the unaligned variants).
44 */ 45#define MOVADQ movaps 46#define MOVUDQ movups 47 48#ifdef __x86_64__ 49 50# constants in mergeable sections, linker can reorder and merge 51.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 52.align 16 53.Lgf128mul_x_ble_mask: 54 .octa 0x00000000000000010000000000000087 55.section .rodata.cst16.POLY, "aM", @progbits, 16 56.align 16 57POLY: .octa 0xC2000000000000000000000000000001 58.section .rodata.cst16.TWOONE, "aM", @progbits, 16 59.align 16 60TWOONE: .octa 0x00000001000000000000000000000001 61 62.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 63.align 16 64SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F 65.section .rodata.cst16.MASK1, "aM", @progbits, 16 66.align 16 67MASK1: .octa 0x0000000000000000ffffffffffffffff 68.section .rodata.cst16.MASK2, "aM", @progbits, 16 69.align 16 70MASK2: .octa 0xffffffffffffffff0000000000000000 71.section .rodata.cst16.ONE, "aM", @progbits, 16 72.align 16 73ONE: .octa 0x00000000000000000000000000000001 74.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 75.align 16 76F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 77.section .rodata.cst16.dec, "aM", @progbits, 16 78.align 16 79dec: .octa 0x1 80.section .rodata.cst16.enc, "aM", @progbits, 16 81.align 16 82enc: .octa 0x2 83 84# order of these constants should not change. 
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
/*
 * The three 16-byte rows below form one contiguous 48-byte table.  The GCM
 * macros index into it at byte granularity (for example SHIFT_MASK+16-n and
 * ALL_F+16-n) and read 16 bytes with movdqu, which is why the rows must stay
 * adjacent and in this order inside a single non-mergeable section.
 */
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text


/*
 * Bytes pushed by FUNC_SAVE (three 8-byte registers).  The stack-argument
 * aliases arg7..arg11 below add this to their %rsp displacement, so
 * STACK_OFFSET and FUNC_SAVE/FUNC_RESTORE must be changed together.
 */
#define	STACK_OFFSET    8*3

/*
 * Byte offsets of the fields in the GCM context block addressed through
 * %arg2.  They are written as displacement expressions and used in the form
 * Name(%arg2).
 */
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)

/*
 * Argument aliases.  arg1..arg6 are bare register names (no % sign); code
 * writes %arg1 so that the preprocessor produces %rdi and so on.  arg7 and up
 * are complete memory operands relative to %rsp and are only valid after
 * FUNC_SAVE has run and before any further change to %rsp.
 */
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
/*
 * 32-bit key length read at byte offset 480 from the key schedule pointer.
 * NOTE(review): presumably the length field that follows two 15x16-byte
 * round-key arrays - confirm against the C-side key context layout.
 */
#define keysize 2*15*16(%arg1)
#endif


/* XMM register roles for the non-GCM AES routines. */
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

/* Shares %xmm10 with BSWAP_MASK. */
#define GF128MUL_MASK %xmm10

/* General purpose register roles, per word size. */
#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

/*
 * FUNC_SAVE: push %r12, %r13 and %r14 (3 * 8 bytes == STACK_OFFSET).
 * Must be paired with FUNC_RESTORE, which pops them in reverse order.
 */
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


/* FUNC_RESTORE: undo FUNC_SAVE (pop %r14, %r13, %r12). */
.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm

# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data.  Only needs to be called
# once per key.
# clobbers r12, and tmp xmm registers.
/*
 * PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 *
 *   SUBKEY    operand holding the address of the 16-byte hash subkey
 *   TMP1-TMP7 scratch XMM registers, all clobbered
 *
 * Writes HashKey^n<<1 mod poly for n = 1..4 and, for each power, the XOR of
 * its high and low 64-bit halves (the Karatsuba operand) into the context
 * block addressed by %arg2.  Clobbers %r12.
 *
 * All stores into the context use movdqu.  The context is supplied by the
 * caller and nothing in this file guarantees that it is 16-byte aligned; an
 * aligned store (movdqa) to a misaligned address raises #GP.
 */
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	PSHUFB_XMM \TMP2, \TMP3		# byte-reflect the hash subkey

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3		# shift each 64-bit half left by one
	psrlq	$63, \TMP2		# TMP2 = bit shifted out of each half
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2		# carry of the low half, moved up
	psrldq	$8, \TMP1		# carry of the high half, moved down
	por	\TMP2, \TMP3		# TMP3 = HashKey<<1 (128-bit shift)

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd TWOONE(%rip), \TMP2	# all-ones lanes where the carry was set
	pand	POLY(%rip), \TMP2	# select POLY only if the carry was set
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1	# swap the 64-bit halves
	pxor	   \TMP3, \TMP1		# TMP1 = high half XOR low half
	movdqu	   \TMP1, HashKey_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	   \TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_2_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	   \TMP5, HashKey_3(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_3_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	   \TMP5, HashKey_4(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_4_k(%arg2)
.endm

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm7, %xmm13, %xmm14
/*
 * GCM_INIT Iv SUBKEY AAD AADLEN
 *
 *   Iv       operand holding the address of the 16-byte initial counter block
 *   SUBKEY   operand holding the address of the hash subkey (see PRECOMPUTE)
 *   AAD      operand holding the address of the additional authenticated data
 *   AADLEN   operand holding the AAD length in bytes
 *
 * Resets the length and partial-block fields of the context at %arg2, stores
 * the IV both verbatim (OrigIV) and byte-reflected (CurCount), precomputes
 * the hash key powers and hashes the AAD into AadHash.
 *
 * xmm7 is clobbered through PRECOMPUTE and xmm14 through CALC_AAD_HASH.
 * The HashKey reload uses movdqu because the context is not guaranteed to be
 * 16-byte aligned.
 */
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov \AADLEN, %r11
	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
	xor %r11, %r11
	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
	mov \Iv, %rax
	movdqu (%rax), %xmm0
	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv

	movdqa  SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm0
	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
/*
 * GCM_ENC_DEC operation      (operation is the literal enc or dec)
 *
 * Register contract, as used by the body:
 *   %arg2  context block (AadHash, HashKey, CurCount, InLen, PBlock* fields)
 *   %arg3  output buffer
 *   %arg4  input buffer
 *   %arg5  number of input bytes; reduced by the bytes PARTIAL_BLOCK consumes
 *   %r11   running byte offset into the input and output buffers
 *   %r13   bytes still to be processed in whole 16-byte blocks
 *   %xmm8  running GHASH value, %xmm0 current counter block
 *
 * Flow: finish a partial block left by an earlier call, process
 * (blocks mod 4) leading blocks, loop four blocks at a time, then handle a
 * trailing block of fewer than 16 bytes and record it in the context.
 */
.macro GCM_ENC_DEC operation
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13
	add %arg5, InLen(%arg2)

	xor %r11, %r11 # initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub %r11, %arg5		# sub partial block data used
	mov %arg5, %r13		# save the number of bytes

	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
	# Encrypt/Decrypt first few blocks

	and	$(3<<4), %r12	# %r12 = 16 * (number of whole blocks mod 4)
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
	/* The last operand below is the literal enc in both modes. */
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu %xmm8, AadHash(%arg2)
	movdqu %xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov %r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
	movdqu %xmm0, CurCount(%arg2)
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
	movdqu %xmm0, PBlockEncKey(%arg2)

	/*
	 * With fewer than 16 bytes in total a 16-byte load ending at the
	 * last input byte would start before the buffer, so read the tail
	 * byte-wise instead.
	 */
	cmp	$16, %arg5
	jge _large_enough_update_\@

	lea (%arg4,%r11,1), %r10
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp _data_read_\@

_large_enough_update_\@:
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	PSHUFB_XMM  %xmm2, %xmm1

_data_read_\@:
	lea ALL_F+16(%rip), %r12
	sub %r13, %r12

.ifc \operation, dec
	movdqa  %xmm1, %xmm2		# keep the ciphertext for GHASH
.endif
	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand    %xmm1, %xmm2
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor %xmm2, %xmm8
.else
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8
.endif

	movdqu %xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm0
.endif

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_\@
	mov %rax, (%arg3 , %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_\@:
	mov %al,  (%arg3, %r11, 1)
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm

# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
/*
 * GCM_COMPLETE AUTHTAG AUTHTAGLEN
 *
 *   AUTHTAG     operand holding the address the tag is written to
 *   AUTHTAGLEN  operand holding the number of tag bytes to write
 *
 * Folds a pending partial block into the hash, mixes in len(A)||len(C),
 * XORs the result with E(K, Y0) and stores AUTHTAGLEN bytes of it.
 *
 * NOTE(review): for lengths other than 16 the store sequence writes 8 and/or
 * 4 bytes first and only then handles a 1-3 byte remainder, so a length
 * below 4 writes 4 bytes.  Presumably callers never request that - confirm.
 */
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13

	mov PBlockLen(%arg2), %r12

	cmp $0, %r12
	je _partial_done\@

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
	shl	$3, %r12		  # convert into number of bits
	/*
	 * NOTE(review): movd copies only the low 32 bits of the bit count,
	 * so an AAD of 512 MiB or more would be truncated here.
	 */
	movd	%r12d, %xmm15		  # len(A) in %xmm15
	mov InLen(%arg2), %r12
	shl     $3, %r12                  # len(C) in bits (*128)
	MOVQ_R64_XMM    %r12, %xmm1

	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_\@:
	mov	\AUTHTAG, %r10                     # %r10 = authTag
	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm

#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
/*
 * GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 *
 *   GH       in: first operand, out: the reduced 128-bit product
 *   HK       second operand, left unmodified
 *   TMP1-5   scratch XMM registers, all clobbered
 *
 * One Karatsuba carry-less multiply (three PCLMULQDQ) followed by a
 * two-phase shift-and-xor reduction.  No general purpose register is used.
 */
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2            # TMP2 = a1+a0
	pxor	  \HK, \TMP3            # TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = (a1*b0)+(a0*b1)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld     $31, \TMP2            # packed left shift <<31
	pslld     $30, \TMP3            # packed left shift <<30
	pslld     $25, \TMP4            # packed left shift <<25
	pxor      \TMP3, \TMP2          # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5             # right shift TMP5 1 DW
	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
	pxor      \TMP2, \GH

	# second phase of the reduction

	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2              # packed right shift >>1
	psrld     $2,\TMP3              # packed right shift >>2
	psrld     $7,\TMP4              # packed right shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in GH
.endm

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
/*
 * READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
 *
 * Never touches memory outside DPTR .. DPTR+DLEN-1.  For DLEN >= 8 the
 * first eight bytes are fetched with one 64-bit load and the rest one byte
 * at a time; for DLEN < 8 every byte is fetched individually.  Bytes are
 * collected from the highest address downwards (shift %rax left, load into
 * %al), which leaves them in memory order once %rax is moved into the XMM
 * register.  Unread byte positions of XMMDst end up zero.  DLEN is counted
 * down to zero.
 */
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp $8, \DLEN
	jl _read_lt8_\@
	mov (\DPTR), %rax
	MOVQ_R64_XMM %rax, \XMMDst
	sub $8, \DLEN
	jz _done_read_partial_block_\@
	xor %eax, %eax
_read_next_byte_\@:
	shl $8, %rax
	mov 7(\DPTR, \DLEN, 1), %al
	dec \DLEN
	jnz _read_next_byte_\@
	MOVQ_R64_XMM %rax, \XMM1
	pslldq $8, \XMM1
	por \XMM1, \XMMDst
	jmp _done_read_partial_block_\@
_read_lt8_\@:
	xor %eax, %eax
_read_next_byte_lt8_\@:
	shl $8, %rax
	mov -1(\DPTR, \DLEN, 1), %al
	dec \DLEN
	jnz _read_next_byte_lt8_\@
	MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm

# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers rax, r10-11, xmm14 and the TMP registers
/*
 * CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 *
 *   HASHKEY  XMM register holding the hash key (passed on to GHASH_MUL)
 *   AAD      operand holding the address of the data to hash
 *   AADLEN   operand holding its length in bytes
 *   TMP6     running hash, TMP7 current block, TMP1-TMP5 scratch
 *
 * Hashes whole 16-byte blocks first, then a zero-padded tail of fewer than
 * 16 bytes read through READ_PARTIAL_BLOCK (which is what clobbers %rax),
 * and stores the result in AadHash of the context at %arg2.
 */
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   \AAD, %r10		# %r10 = AAD
	mov	   \AADLEN, %r11		# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6

	cmp	   $16, %r11
	jl	   _get_AAD_rest\@
_get_AAD_blocks\@:
	movdqu	   (%r10), \TMP7
	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\@

	movdqu	   \TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	cmp	   $0, %r11
	je	   _get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	   \TMP7, \TMP6

_get_AAD_done\@:
	movdqu \TMP6, AadHash(%arg2)
.endm

# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
642# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK 643# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context 644# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 645.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ 646 AAD_HASH operation 647 mov PBlockLen(%arg2), %r13 648 cmp $0, %r13 649 je _partial_block_done_\@ # Leave Macro if no partial blocks 650 # Read in input data without over reading 651 cmp $16, \PLAIN_CYPH_LEN 652 jl _fewer_than_16_bytes_\@ 653 movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm 654 jmp _data_read_\@ 655 656_fewer_than_16_bytes_\@: 657 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 658 mov \PLAIN_CYPH_LEN, %r12 659 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 660 661 mov PBlockLen(%arg2), %r13 662 663_data_read_\@: # Finished reading in data 664 665 movdqu PBlockEncKey(%arg2), %xmm9 666 movdqu HashKey(%arg2), %xmm13 667 668 lea SHIFT_MASK(%rip), %r12 669 670 # adjust the shuffle mask pointer to be able to shift r13 bytes 671 # r16-r13 is the number of bytes in plaintext mod 16) 672 add %r13, %r12 673 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 674 PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes 675 676.ifc \operation, dec 677 movdqa %xmm1, %xmm3 678 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) 679 680 mov \PLAIN_CYPH_LEN, %r10 681 add %r13, %r10 682 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 683 sub $16, %r10 684 # Determine if if partial block is not being filled and 685 # shift mask accordingly 686 jge _no_extra_mask_1_\@ 687 sub %r10, %r12 688_no_extra_mask_1_\@: 689 690 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 691 # get the appropriate mask to mask out bottom r13 bytes of xmm9 692 pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9 693 694 pand %xmm1, %xmm3 695 movdqa SHUF_MASK(%rip), %xmm10 696 PSHUFB_XMM %xmm10, %xmm3 697 PSHUFB_XMM %xmm2, %xmm3 698 pxor %xmm3, \AAD_HASH 699 700 cmp $0, %r10 701 jl 
_partial_incomplete_1_\@ 702 703 # GHASH computation for the last <16 Byte block 704 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 705 xor %rax,%rax 706 707 mov %rax, PBlockLen(%arg2) 708 jmp _dec_done_\@ 709_partial_incomplete_1_\@: 710 add \PLAIN_CYPH_LEN, PBlockLen(%arg2) 711_dec_done_\@: 712 movdqu \AAD_HASH, AadHash(%arg2) 713.else 714 pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) 715 716 mov \PLAIN_CYPH_LEN, %r10 717 add %r13, %r10 718 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 719 sub $16, %r10 720 # Determine if if partial block is not being filled and 721 # shift mask accordingly 722 jge _no_extra_mask_2_\@ 723 sub %r10, %r12 724_no_extra_mask_2_\@: 725 726 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 727 # get the appropriate mask to mask out bottom r13 bytes of xmm9 728 pand %xmm1, %xmm9 729 730 movdqa SHUF_MASK(%rip), %xmm1 731 PSHUFB_XMM %xmm1, %xmm9 732 PSHUFB_XMM %xmm2, %xmm9 733 pxor %xmm9, \AAD_HASH 734 735 cmp $0, %r10 736 jl _partial_incomplete_2_\@ 737 738 # GHASH computation for the last <16 Byte block 739 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 740 xor %rax,%rax 741 742 mov %rax, PBlockLen(%arg2) 743 jmp _encode_done_\@ 744_partial_incomplete_2_\@: 745 add \PLAIN_CYPH_LEN, PBlockLen(%arg2) 746_encode_done_\@: 747 movdqu \AAD_HASH, AadHash(%arg2) 748 749 movdqa SHUF_MASK(%rip), %xmm10 750 # shuffle xmm9 back to output as ciphertext 751 PSHUFB_XMM %xmm10, %xmm9 752 PSHUFB_XMM %xmm2, %xmm9 753.endif 754 # output encrypted Bytes 755 cmp $0, %r10 756 jl _partial_fill_\@ 757 mov %r13, %r12 758 mov $16, %r13 759 # Set r13 to be the number of bytes to write out 760 sub %r12, %r13 761 jmp _count_set_\@ 762_partial_fill_\@: 763 mov \PLAIN_CYPH_LEN, %r13 764_count_set_\@: 765 movdqa %xmm9, %xmm0 766 MOVQ_R64_XMM %xmm0, %rax 767 cmp $8, %r13 768 jle _less_than_8_bytes_left_\@ 769 770 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 771 add $8, \DATA_OFFSET 772 psrldq $8, %xmm0 773 MOVQ_R64_XMM 
%xmm0, %rax 774 sub $8, %r13 775_less_than_8_bytes_left_\@: 776 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 777 add $1, \DATA_OFFSET 778 shr $8, %rax 779 sub $1, %r13 780 jne _less_than_8_bytes_left_\@ 781_partial_block_done_\@: 782.endm # PARTIAL_BLOCK 783 784/* 785* if a = number of total plaintext bytes 786* b = floor(a/16) 787* num_initial_blocks = b mod 4 788* encrypt the initial num_initial_blocks blocks and apply ghash on 789* the ciphertext 790* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 791* are clobbered 792* arg1, %arg2, %arg3 are used as a pointer only, not modified 793*/ 794 795 796.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 797 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 798 MOVADQ SHUF_MASK(%rip), %xmm14 799 800 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 801 802 # start AES for num_initial_blocks blocks 803 804 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 805 806.if (\i == 5) || (\i == 6) || (\i == 7) 807 808 MOVADQ ONE(%RIP),\TMP1 809 MOVADQ 0(%arg1),\TMP2 810.irpc index, \i_seq 811 paddd \TMP1, \XMM0 # INCR Y0 812.ifc \operation, dec 813 movdqa \XMM0, %xmm\index 814.else 815 MOVADQ \XMM0, %xmm\index 816.endif 817 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 818 pxor \TMP2, %xmm\index 819.endr 820 lea 0x10(%arg1),%r10 821 mov keysize,%eax 822 shr $2,%eax # 128->4, 192->6, 256->8 823 add $5,%eax # 128->9, 192->11, 256->13 824 825aes_loop_initial_\@: 826 MOVADQ (%r10),\TMP1 827.irpc index, \i_seq 828 AESENC \TMP1, %xmm\index 829.endr 830 add $16,%r10 831 sub $1,%eax 832 jnz aes_loop_initial_\@ 833 834 MOVADQ (%r10), \TMP1 835.irpc index, \i_seq 836 AESENCLAST \TMP1, %xmm\index # Last Round 837.endr 838.irpc index, \i_seq 839 movdqu (%arg4 , %r11, 1), \TMP1 840 pxor \TMP1, %xmm\index 841 movdqu %xmm\index, (%arg3 , %r11, 1) 842 # write back plaintext/ciphertext for num_initial_blocks 843 add $16, %r11 844 845.ifc \operation, dec 846 movdqa \TMP1, %xmm\index 847.endif 848 PSHUFB_XMM %xmm14, 
%xmm\index 849 850 # prepare plaintext/ciphertext for GHASH computation 851.endr 852.endif 853 854 # apply GHASH on num_initial_blocks blocks 855 856.if \i == 5 857 pxor %xmm5, %xmm6 858 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 859 pxor %xmm6, %xmm7 860 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 861 pxor %xmm7, %xmm8 862 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 863.elseif \i == 6 864 pxor %xmm6, %xmm7 865 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 866 pxor %xmm7, %xmm8 867 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 868.elseif \i == 7 869 pxor %xmm7, %xmm8 870 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 871.endif 872 cmp $64, %r13 873 jl _initial_blocks_done\@ 874 # no need for precomputed values 875/* 876* 877* Precomputations for HashKey parallel with encryption of first 4 blocks. 878* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 879*/ 880 MOVADQ ONE(%RIP),\TMP1 881 paddd \TMP1, \XMM0 # INCR Y0 882 MOVADQ \XMM0, \XMM1 883 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 884 885 paddd \TMP1, \XMM0 # INCR Y0 886 MOVADQ \XMM0, \XMM2 887 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 888 889 paddd \TMP1, \XMM0 # INCR Y0 890 MOVADQ \XMM0, \XMM3 891 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 892 893 paddd \TMP1, \XMM0 # INCR Y0 894 MOVADQ \XMM0, \XMM4 895 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 896 897 MOVADQ 0(%arg1),\TMP1 898 pxor \TMP1, \XMM1 899 pxor \TMP1, \XMM2 900 pxor \TMP1, \XMM3 901 pxor \TMP1, \XMM4 902.irpc index, 1234 # do 4 rounds 903 movaps 0x10*\index(%arg1), \TMP1 904 AESENC \TMP1, \XMM1 905 AESENC \TMP1, \XMM2 906 AESENC \TMP1, \XMM3 907 AESENC \TMP1, \XMM4 908.endr 909.irpc index, 56789 # do next 5 rounds 910 movaps 0x10*\index(%arg1), \TMP1 911 AESENC \TMP1, \XMM1 912 AESENC \TMP1, \XMM2 913 AESENC \TMP1, \XMM3 914 AESENC \TMP1, \XMM4 915.endr 916 lea 0xa0(%arg1),%r10 917 mov keysize,%eax 918 shr $2,%eax # 
128->4, 192->6, 256->8 919 sub $4,%eax # 128->0, 192->2, 256->4 920 jz aes_loop_pre_done\@ 921 922aes_loop_pre_\@: 923 MOVADQ (%r10),\TMP2 924.irpc index, 1234 925 AESENC \TMP2, %xmm\index 926.endr 927 add $16,%r10 928 sub $1,%eax 929 jnz aes_loop_pre_\@ 930 931aes_loop_pre_done\@: 932 MOVADQ (%r10), \TMP2 933 AESENCLAST \TMP2, \XMM1 934 AESENCLAST \TMP2, \XMM2 935 AESENCLAST \TMP2, \XMM3 936 AESENCLAST \TMP2, \XMM4 937 movdqu 16*0(%arg4 , %r11 , 1), \TMP1 938 pxor \TMP1, \XMM1 939.ifc \operation, dec 940 movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 941 movdqa \TMP1, \XMM1 942.endif 943 movdqu 16*1(%arg4 , %r11 , 1), \TMP1 944 pxor \TMP1, \XMM2 945.ifc \operation, dec 946 movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 947 movdqa \TMP1, \XMM2 948.endif 949 movdqu 16*2(%arg4 , %r11 , 1), \TMP1 950 pxor \TMP1, \XMM3 951.ifc \operation, dec 952 movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 953 movdqa \TMP1, \XMM3 954.endif 955 movdqu 16*3(%arg4 , %r11 , 1), \TMP1 956 pxor \TMP1, \XMM4 957.ifc \operation, dec 958 movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 959 movdqa \TMP1, \XMM4 960.else 961 movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 962 movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 963 movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 964 movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 965.endif 966 967 add $64, %r11 968 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 969 pxor \XMMDst, \XMM1 970# combine GHASHed value with the corresponding ciphertext 971 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 972 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 973 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 974 975_initial_blocks_done\@: 976 977.endm 978 979/* 980* encrypt 4 blocks at a time 981* ghash the 4 previously encrypted ciphertext blocks 982* arg1, %arg3, %arg4 are used as pointers only, not modified 983* %r11 is the data offset value 984*/ 985.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ 986TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 987 988 movdqa \XMM1, \XMM5 989 movdqa \XMM2, \XMM6 990 
movdqa \XMM3, \XMM7 991 movdqa \XMM4, \XMM8 992 993 movdqa SHUF_MASK(%rip), %xmm15 994 # multiply TMP5 * HashKey using karatsuba 995 996 movdqa \XMM5, \TMP4 997 pshufd $78, \XMM5, \TMP6 998 pxor \XMM5, \TMP6 999 paddd ONE(%rip), \XMM0 # INCR CNT 1000 movdqa HashKey_4(%arg2), \TMP5 1001 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 1002 movdqa \XMM0, \XMM1 1003 paddd ONE(%rip), \XMM0 # INCR CNT 1004 movdqa \XMM0, \XMM2 1005 paddd ONE(%rip), \XMM0 # INCR CNT 1006 movdqa \XMM0, \XMM3 1007 paddd ONE(%rip), \XMM0 # INCR CNT 1008 movdqa \XMM0, \XMM4 1009 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 1010 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 1011 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 1012 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 1013 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 1014 1015 pxor (%arg1), \XMM1 1016 pxor (%arg1), \XMM2 1017 pxor (%arg1), \XMM3 1018 pxor (%arg1), \XMM4 1019 movdqa HashKey_4_k(%arg2), \TMP5 1020 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 1021 movaps 0x10(%arg1), \TMP1 1022 AESENC \TMP1, \XMM1 # Round 1 1023 AESENC \TMP1, \XMM2 1024 AESENC \TMP1, \XMM3 1025 AESENC \TMP1, \XMM4 1026 movaps 0x20(%arg1), \TMP1 1027 AESENC \TMP1, \XMM1 # Round 2 1028 AESENC \TMP1, \XMM2 1029 AESENC \TMP1, \XMM3 1030 AESENC \TMP1, \XMM4 1031 movdqa \XMM6, \TMP1 1032 pshufd $78, \XMM6, \TMP2 1033 pxor \XMM6, \TMP2 1034 movdqa HashKey_3(%arg2), \TMP5 1035 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 1036 movaps 0x30(%arg1), \TMP3 1037 AESENC \TMP3, \XMM1 # Round 3 1038 AESENC \TMP3, \XMM2 1039 AESENC \TMP3, \XMM3 1040 AESENC \TMP3, \XMM4 1041 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 1042 movaps 0x40(%arg1), \TMP3 1043 AESENC \TMP3, \XMM1 # Round 4 1044 AESENC \TMP3, \XMM2 1045 AESENC \TMP3, \XMM3 1046 AESENC \TMP3, \XMM4 1047 movdqa HashKey_3_k(%arg2), \TMP5 1048 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1049 movaps 0x50(%arg1), \TMP3 1050 AESENC \TMP3, \XMM1 # Round 5 1051 AESENC \TMP3, \XMM2 1052 AESENC 
\TMP3, \XMM3 1053 AESENC \TMP3, \XMM4 1054 pxor \TMP1, \TMP4 1055# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1056 pxor \XMM6, \XMM5 1057 pxor \TMP2, \TMP6 1058 movdqa \XMM7, \TMP1 1059 pshufd $78, \XMM7, \TMP2 1060 pxor \XMM7, \TMP2 1061 movdqa HashKey_2(%arg2), \TMP5 1062 1063 # Multiply TMP5 * HashKey using karatsuba 1064 1065 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1066 movaps 0x60(%arg1), \TMP3 1067 AESENC \TMP3, \XMM1 # Round 6 1068 AESENC \TMP3, \XMM2 1069 AESENC \TMP3, \XMM3 1070 AESENC \TMP3, \XMM4 1071 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 1072 movaps 0x70(%arg1), \TMP3 1073 AESENC \TMP3, \XMM1 # Round 7 1074 AESENC \TMP3, \XMM2 1075 AESENC \TMP3, \XMM3 1076 AESENC \TMP3, \XMM4 1077 movdqa HashKey_2_k(%arg2), \TMP5 1078 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1079 movaps 0x80(%arg1), \TMP3 1080 AESENC \TMP3, \XMM1 # Round 8 1081 AESENC \TMP3, \XMM2 1082 AESENC \TMP3, \XMM3 1083 AESENC \TMP3, \XMM4 1084 pxor \TMP1, \TMP4 1085# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1086 pxor \XMM7, \XMM5 1087 pxor \TMP2, \TMP6 1088 1089 # Multiply XMM8 * HashKey 1090 # XMM8 and TMP5 hold the values for the two operands 1091 1092 movdqa \XMM8, \TMP1 1093 pshufd $78, \XMM8, \TMP2 1094 pxor \XMM8, \TMP2 1095 movdqa HashKey(%arg2), \TMP5 1096 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1097 movaps 0x90(%arg1), \TMP3 1098 AESENC \TMP3, \XMM1 # Round 9 1099 AESENC \TMP3, \XMM2 1100 AESENC \TMP3, \XMM3 1101 AESENC \TMP3, \XMM4 1102 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 1103 lea 0xa0(%arg1),%r10 1104 mov keysize,%eax 1105 shr $2,%eax # 128->4, 192->6, 256->8 1106 sub $4,%eax # 128->0, 192->2, 256->4 1107 jz aes_loop_par_enc_done\@ 1108 1109aes_loop_par_enc\@: 1110 MOVADQ (%r10),\TMP3 1111.irpc index, 1234 1112 AESENC \TMP3, %xmm\index 1113.endr 1114 add $16,%r10 1115 sub $1,%eax 1116 jnz aes_loop_par_enc\@ 1117 1118aes_loop_par_enc_done\@: 1119 MOVADQ (%r10), \TMP3 1120 AESENCLAST \TMP3, \XMM1 # Round 10 
1121 AESENCLAST \TMP3, \XMM2 1122 AESENCLAST \TMP3, \XMM3 1123 AESENCLAST \TMP3, \XMM4 1124 movdqa HashKey_k(%arg2), \TMP5 1125 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1126 movdqu (%arg4,%r11,1), \TMP3 1127 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1128 movdqu 16(%arg4,%r11,1), \TMP3 1129 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1130 movdqu 32(%arg4,%r11,1), \TMP3 1131 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1132 movdqu 48(%arg4,%r11,1), \TMP3 1133 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1134 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer 1135 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer 1136 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer 1137 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer 1138 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 1139 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 1140 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 1141 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 1142 1143 pxor \TMP4, \TMP1 1144 pxor \XMM8, \XMM5 1145 pxor \TMP6, \TMP2 1146 pxor \TMP1, \TMP2 1147 pxor \XMM5, \TMP2 1148 movdqa \TMP2, \TMP3 1149 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1150 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1151 pxor \TMP3, \XMM5 1152 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1153 1154 # first phase of reduction 1155 1156 movdqa \XMM5, \TMP2 1157 movdqa \XMM5, \TMP3 1158 movdqa \XMM5, \TMP4 1159# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1160 pslld $31, \TMP2 # packed right shift << 31 1161 pslld $30, \TMP3 # packed right shift << 30 1162 pslld $25, \TMP4 # packed right shift << 25 1163 pxor \TMP3, \TMP2 # xor the shifted versions 1164 pxor \TMP4, \TMP2 1165 movdqa \TMP2, \TMP5 1166 psrldq $4, \TMP5 # right shift T5 1 DW 1167 pslldq $12, \TMP2 # left shift T2 3 DWs 1168 pxor \TMP2, \XMM5 1169 1170 # second phase of reduction 1171 1172 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 
/* ...into TMP2, TMP3, TMP4 (tail of the comment that starts on the previous line) */
	movdqa \XMM5,\TMP3
	movdqa \XMM5,\TMP4
	psrld $1, \TMP2			# packed right shift >> 1
	psrld $2, \TMP3			# packed right shift >> 2
	psrld $7, \TMP4			# packed right shift >> 7
	pxor \TMP3,\TMP2		# xor the shifted versions
	pxor \TMP4,\TMP2
	pxor \TMP5, \TMP2
	pxor \TMP2, \XMM5
	pxor \TMP1, \XMM5		# reduced result is in XMM5

	pxor \XMM5, \XMM1		# fold the reduced hash into XMM1
.endm

/*
* GHASH_4_ENCRYPT_4_PARALLEL_DEC
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks passed in XMM1..XMM4
*
* - XMM1..XMM4 are copied to XMM5..XMM8 and multiplied (Karatsuba) by
*   HashKey_4, HashKey_3, HashKey_2 and HashKey respectively; the products
*   are summed, reduced, and the reduced value is XORed into the new XMM1.
* - XMM0 is the counter: it is incremented four times (paddd ONE) to build
*   four counter blocks, which are byte-swapped with SHUF_MASK and run
*   through the AES rounds using the round keys at (%arg1).
* - 64 bytes are read from (%arg4,%r11), XORed with the key stream and
*   written to (%arg3,%r11); the input blocks are kept, byte-swapped, in
*   XMM1..XMM4 as the GHASH input of the next invocation.
*
* %arg1, %arg3, %arg4 are used as pointers only, not modified
* the HashKey* values are addressed relative to %arg2
* %r11 is the data offset value
* clobbers %eax, %r10, %xmm15, TMP1..TMP6 and XMM5..XMM8
* NOTE: the extra-round loop names %xmm1..%xmm4 literally (.irpc), so
* XMM1..XMM4 have to be %xmm1..%xmm4.
* The 'operation' argument is not referenced in the macro body.
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa \XMM1, \XMM5
	movdqa \XMM2, \XMM6
	movdqa \XMM3, \XMM7
	movdqa \XMM4, \XMM8

	movdqa SHUF_MASK(%rip), %xmm15
	# multiply XMM5 * HashKey_4 using karatsuba

	movdqa \XMM5, \TMP4
	pshufd $78, \XMM5, \TMP6
	pxor \XMM5, \TMP6
	paddd ONE(%rip), \XMM0		# INCR CNT
	movdqa HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	movdqa \XMM0, \XMM1
	paddd ONE(%rip), \XMM0		# INCR CNT
	movdqa \XMM0, \XMM2
	paddd ONE(%rip), \XMM0		# INCR CNT
	movdqa \XMM0, \XMM3
	paddd ONE(%rip), \XMM0		# INCR CNT
	movdqa \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor (%arg1), \XMM1		# round 0: XOR with the first round key
	pxor (%arg1), \XMM2
	pxor (%arg1), \XMM3
	pxor (%arg1), \XMM4
	movdqa HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC \TMP1, \XMM1		# Round 1
	AESENC \TMP1, \XMM2
	AESENC \TMP1, \XMM3
	AESENC \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC \TMP1, \XMM1		# Round 2
	AESENC \TMP1, \XMM2
	AESENC \TMP1, \XMM3
	AESENC \TMP1, \XMM4
	# multiply XMM6 * HashKey_3 using karatsuba
	movdqa \XMM6, \TMP1
	pshufd $78, \XMM6, \TMP2
	pxor \XMM6, \TMP2
	movdqa HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 3
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 4
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	movdqa HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 5
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor \XMM6, \XMM5
	pxor \TMP2, \TMP6
	movdqa \XMM7, \TMP1
	pshufd $78, \XMM7, \TMP2
	pxor \XMM7, \TMP2
	movdqa HashKey_2(%arg2), \TMP5

	# Multiply XMM7 * HashKey_2 using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 6
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 7
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	movdqa HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 8
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor \XMM7, \XMM5
	pxor \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa \XMM8, \TMP1
	pshufd $78, \XMM8, \TMP2
	pxor \XMM8, \TMP2
	movdqa HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC \TMP3, \XMM1		# Round 9
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	lea 0xa0(%arg1),%r10		# %r10 = address of the round key after round 9
	mov keysize,%eax
	shr $2,%eax			# 128->4, 192->6, 256->8
	sub $4,%eax			# 128->0, 192->2, 256->4
	jz aes_loop_par_dec_done\@	# no extra rounds to run

aes_loop_par_dec\@:			# extra AESENC rounds, %eax = rounds left
	MOVADQ (%r10),\TMP3
.irpc index, 1234
	AESENC \TMP3, %xmm\index
.endr
	add $16,%r10
	sub $1,%eax
	jnz aes_loop_par_dec\@

aes_loop_par_dec_done\@:
	MOVADQ (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1		# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqu (%arg4,%r11,1), \TMP3
	pxor \TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
	movdqu \XMM1, (%arg3,%r11,1)	# Write to plaintext buffer
	movdqa \TMP3, \XMM1		# keep the input block for the next GHASH
	movdqu 16(%arg4,%r11,1), \TMP3
	pxor \TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
	movdqu \XMM2, 16(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa \TMP3, \XMM2
	movdqu 32(%arg4,%r11,1), \TMP3
	pxor \TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
	movdqu \XMM3, 32(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa \TMP3, \XMM3
	movdqu 48(%arg4,%r11,1), \TMP3
	pxor \TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
	movdqu \XMM4, 48(%arg3,%r11,1)	# Write to plaintext buffer
	movdqa \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor \TMP4, \TMP1
	pxor \XMM8, \XMM5
	pxor \TMP6, \TMP2
	pxor \TMP1, \TMP2
	pxor \XMM5, \TMP2
	movdqa \TMP2, \TMP3
	pslldq $8, \TMP3		# left shift TMP3 2 DWs
	psrldq $8, \TMP2		# right shift TMP2 2 DWs
	pxor \TMP3, \XMM5
	pxor \TMP2, \TMP1		# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa \XMM5, \TMP2
	movdqa \XMM5, \TMP3
	movdqa \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld $31, \TMP2		# packed left shift << 31
	pslld $30, \TMP3		# packed left shift << 30
	pslld $25, \TMP4		# packed left shift << 25
	pxor \TMP3, \TMP2		# xor the shifted versions
	pxor \TMP4, \TMP2
	movdqa \TMP2, \TMP5
	psrldq $4, \TMP5		# right shift TMP5 1 DW
	pslldq $12, \TMP2		# left shift TMP2 3 DWs
	pxor \TMP2, \XMM5

	# second phase of reduction

	movdqa \XMM5,\TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa \XMM5,\TMP3
	movdqa \XMM5,\TMP4
	psrld $1, \TMP2			# packed right shift >> 1
	psrld $2, \TMP3			# packed right shift >> 2
	psrld $7, \TMP4			# packed right shift >> 7
	pxor \TMP3,\TMP2		# xor the shifted versions
	pxor \TMP4,\TMP2
	pxor \TMP5, \TMP2
	pxor \TMP2, \XMM5
	pxor \TMP1, \XMM5		# reduced result is in XMM5

	pxor \XMM5, \XMM1		# fold the reduced hash into XMM1
.endm

/* GHASH the last 4 ciphertext blocks.
* XMM1..XMM4 are multiplied (Karatsuba) by HashKey_4, HashKey_3, HashKey_2 and
* HashKey (addressed relative to %arg2), the four products are XORed together
* and the sum is reduced into XMMDst.
* XMM1..XMM4 and TMP1..TMP7 are overwritten.
*/
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply XMM1 * HashKey_4 (using Karatsuba)

	movdqa \XMM1, \TMP6
	pshufd $78, \XMM1, \TMP2
	pxor \XMM1, \TMP2
	movdqa HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6	# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1	# XMM1 = a0*b0
	movdqa HashKey_4_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqa \XMM1, \XMMDst
	movdqa \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply XMM2 * HashKey_3 (using Karatsuba)

	movdqa \XMM2, \TMP1
	pshufd $78, \XMM2, \TMP2
	pxor \XMM2, \TMP2
	movdqa HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2	# XMM2 = a0*b0
	movdqa HashKey_3_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor \TMP1, \TMP6
	pxor \XMM2, \XMMDst
	pxor \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM3 * HashKey_2 (using Karatsuba)

	movdqa \XMM3, \TMP1
	pshufd $78, \XMM3, \TMP2
	pxor \XMM3, \TMP2
	movdqa HashKey_2(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3	# XMM3 = a0*b0
	movdqa HashKey_2_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor \TMP1, \TMP6
	pxor \XMM3, \XMMDst
	pxor \TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM4 * HashKey (using Karatsuba)
	movdqa \XMM4, \TMP1
	pshufd $78, \XMM4, \TMP2
	pxor \XMM4, \TMP2
	movdqa HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4	# XMM4 = a0*b0
	movdqa HashKey_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor \TMP1, \TMP6
	pxor \XMM4, \XMMDst
	pxor \XMM1, \TMP2
	pxor \TMP6, \TMP2
	pxor \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa \TMP2, \TMP4
	pslldq $8, \TMP4		# left shift TMP4 2 DWs
	psrldq $8, \TMP2		# right shift TMP2 2 DWs
	pxor \TMP4, \XMMDst
	pxor \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa \XMMDst, \TMP2
	movdqa \XMMDst, \TMP3
	movdqa \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld $31, \TMP2		# packed left shift << 31
	pslld $30, \TMP3		# packed left shift << 30
	pslld $25, \TMP4		# packed left shift << 25
	pxor \TMP3, \TMP2		# xor the shifted versions
	pxor \TMP4, \TMP2
	movdqa \TMP2, \TMP7
	psrldq $4, \TMP7		# right shift TMP7 1 DW
	pslldq $12, \TMP2		# left shift TMP2 3 DWs
	pxor \TMP2, \XMMDst

	# second phase of the reduction
	movdqa \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa \XMMDst, \TMP3
	movdqa \XMMDst, \TMP4
	psrld $1, \TMP2			# packed right shift >> 1
	psrld $2, \TMP3			# packed right shift >> 2
	psrld $7, \TMP4			# packed right shift >> 7
	pxor \TMP3, \TMP2		# xor the shifted versions
	pxor \TMP4, \TMP2
	pxor \TMP7, \TMP2
	pxor \TMP2, \XMMDst
	pxor \TMP6, \XMMDst		# reduced result is in XMMDst
.endm


/* Encryption of a single block
* XMM0: block to encrypt, overwritten with the result
* TMP1: scratch register for the round keys
* %arg1: address of the expanded round keys
* uses eax & r10
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor (%arg1), \XMM0		# round 0: XOR with the first round key
	mov keysize,%eax		# presumably the key length in bytes - TODO confirm
	shr $2,%eax			# 128->4, 192->6, 256->8
	add $5,%eax			# 128->9, 192->11, 256->13
	lea 16(%arg1), %r10		# get first expanded key address

_esb_loop_\@:				# %eax AESENC rounds, one round key each
	MOVADQ (%r10),\TMP1
	AESENC \TMP1,\XMM0
	add $16,%r10
	sub $1,%eax
	jnz _esb_loop_\@

	MOVADQ (%r10),\TMP1		# final round key
	AESENCLAST \TMP1,\XMM0
.endm
/*****************************************************************************
1520* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 1521* struct gcm_context_data *data 1522* // Context data 1523* u8 *out, // Plaintext output. Encrypt in-place is allowed. 1524* const u8 *in, // Ciphertext input 1525* u64 plaintext_len, // Length of data in bytes for decryption. 1526* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) 1527* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) 1528* // concatenated with 0x00000001. 16-byte aligned pointer. 1529* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. 1530* const u8 *aad, // Additional Authentication Data (AAD) 1531* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes 1532* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the 1533* // given authentication tag and only return the plaintext if they match. 1534* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 1535* // (most likely), 12 or 8. 1536* 1537* Assumptions: 1538* 1539* keys: 1540* keys are pre-expanded and aligned to 16 bytes. 
we are using the first 1541* set of 11 keys in the data structure void *aes_ctx 1542* 1543* iv: 1544* 0 1 2 3 1545* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1546* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1547* | Salt (From the SA) | 1548* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1549* | Initialization Vector | 1550* | (This is the sequence number from IPSec header) | 1551* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1552* | 0x1 | 1553* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1554* 1555* 1556* 1557* AAD: 1558* AAD padded to 128 bits with 0 1559* for example, assume AAD is a u32 vector 1560* 1561* if AAD is 8 bytes: 1562* AAD[3] = {A0, A1}; 1563* padded AAD in xmm register = {A1 A0 0 0} 1564* 1565* 0 1 2 3 1566* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1567* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1568* | SPI (A1) | 1569* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1570* | 32-bit Sequence Number (A0) | 1571* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1572* | 0x0 | 1573* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1574* 1575* AAD Format with 32-bit Sequence Number 1576* 1577* if AAD is 12 bytes: 1578* AAD[3] = {A0, A1, A2}; 1579* padded AAD in xmm register = {A2 A1 A0 0} 1580* 1581* 0 1 2 3 1582* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1583* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1584* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1585* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1586* | SPI (A2) | 1587* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1588* | 64-bit Extended Sequence Number {A1,A0} | 1589* | | 1590* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1591* | 0x0 | 1592* 
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                         AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
* Implementation: FUNC_SAVE, then GCM_INIT, GCM_ENC_DEC dec and GCM_COMPLETE
* in sequence, then FUNC_RESTORE.
* NOTE(review): arg7..arg11 are passed without a '%' prefix, unlike %arg6;
* presumably they are stack-argument macros defined elsewhere - confirm.
*
*****************************************************************************/
ENTRY(aesni_gcm_dec)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC dec
	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec)


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // Context data
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes.
we are using the 1631* first set of 11 keys in the data structure void *aes_ctx 1632* 1633* 1634* iv: 1635* 0 1 2 3 1636* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1637* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1638* | Salt (From the SA) | 1639* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1640* | Initialization Vector | 1641* | (This is the sequence number from IPSec header) | 1642* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1643* | 0x1 | 1644* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1645* 1646* 1647* 1648* AAD: 1649* AAD padded to 128 bits with 0 1650* for example, assume AAD is a u32 vector 1651* 1652* if AAD is 8 bytes: 1653* AAD[3] = {A0, A1}; 1654* padded AAD in xmm register = {A1 A0 0 0} 1655* 1656* 0 1 2 3 1657* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1658* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1659* | SPI (A1) | 1660* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1661* | 32-bit Sequence Number (A0) | 1662* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1663* | 0x0 | 1664* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1665* 1666* AAD Format with 32-bit Sequence Number 1667* 1668* if AAD is 12 bytes: 1669* AAD[3] = {A0, A1, A2}; 1670* padded AAD in xmm register = {A2 A1 A0 0} 1671* 1672* 0 1 2 3 1673* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1674* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1675* | SPI (A2) | 1676* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1677* | 64-bit Extended Sequence Number {A1,A0} | 1678* | | 1679* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1680* | 0x0 | 1681* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1682* 1683* AAD Format with 64-bit Extended Sequence Number 1684* 1685* poly = x^128 + x^127 + 
x^126 + x^121 + 1
*
* Implementation: FUNC_SAVE, then GCM_INIT, GCM_ENC_DEC enc and GCM_COMPLETE
* in sequence, then FUNC_RESTORE.
* NOTE(review): arg7..arg11 are passed without a '%' prefix, unlike %arg6;
* presumably they are stack-argument macros defined elsewhere - confirm.
***************************************************************************/
ENTRY(aesni_gcm_enc)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC enc

	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc)

/*****************************************************************************
* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                         // context data
*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                     const u8 *aad,      // Additional Authentication Data (AAD)
*                     u64 aad_len)        // Length of AAD in bytes.
*
* Runs only the GCM_INIT step; iv, hash_subkey, aad and aad_len are handed to
* it as %arg3..%arg6.
*/
ENTRY(aesni_gcm_init)
	FUNC_SAVE
	GCM_INIT %arg3, %arg4,%arg5, %arg6
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_init)

/*****************************************************************************
* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // context data
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len); // Length of data in bytes for encryption.
*
* Runs only the GCM_ENC_DEC enc step.
*/
ENTRY(aesni_gcm_enc_update)
	FUNC_SAVE
	GCM_ENC_DEC enc
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc_update)

/*****************************************************************************
* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // context data
*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
*                    const u8 *in,       // Ciphertext input
*                    u64 plaintext_len); // Length of data in bytes for decryption.
*
* Runs only the GCM_ENC_DEC dec step.
*/
ENTRY(aesni_gcm_dec_update)
	FUNC_SAVE
	GCM_ENC_DEC dec
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec_update)

/*****************************************************************************
* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // context data
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Runs only the GCM_COMPLETE step with auth_tag / auth_tag_len (%arg3, %arg4).
*/
ENTRY(aesni_gcm_finalize)
	FUNC_SAVE
	GCM_COMPLETE %arg3 %arg4
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_finalize)

#endif


/*
 * _key_expansion_128 / _key_expansion_256a: internal helper of aesni_set_key
 * input:
 *	%xmm0:	previous round key
 *	%xmm1:	AESKEYGENASSIST result (its top dword is broadcast and used)
 *	%xmm4:	scratch; aesni_set_key zeroes it before the first call
 *	TKEYP:	where to store the new round key
 * output:
 *	%xmm0:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 0x10
 */
.align 4
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)

/*
 * _key_expansion_192a: internal helper of aesni_set_key (192-bit keys)
 * Updates the key state in %xmm0/%xmm2 from the AESKEYGENASSIST result in
 * %xmm1, stores two 16-byte blocks at (TKEYP) and 0x10(TKEYP) and advances
 * TKEYP by 0x20.
 * changed: %xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
.align 4
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)

/*
 * _key_expansion_192b: internal helper of aesni_set_key (192-bit keys)
 * Same state update as _key_expansion_192a, but stores only %xmm0 at (TKEYP)
 * and advances TKEYP by 0x10.
 * changed: %xmm1, %xmm3, %xmm4, %xmm5
 */
.align 4
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_192b)

/*
 * _key_expansion_256b: internal helper of aesni_set_key (256-bit keys)
 * Counterpart of _key_expansion_256a working on %xmm2: derives the next
 * round key in %xmm2 from %xmm2 and the AESKEYGENASSIST result in %xmm1,
 * stores it at (TKEYP) and advances TKEYP by 0x10.
 * changed: %xmm1, %xmm4
 */
.align 4
_key_expansion_256b:
	pshufd $0b10101010, %xmm1, %xmm1
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_256b)

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Expands in_key into the encryption round keys starting at (ctx), stores
 * key_len at 480(ctx), then builds the decryption round keys starting at
 * 240(ctx) (first/last keys swapped, the others passed through AESIMC in
 * reverse order).  Always returns 0.
 * key_len is not validated here: only its low byte is compared with 24
 * (below -> 128-bit path, equal -> 192-bit path, above -> 256-bit path).
 */
ENTRY(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)		# remember key_len in the context
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	sub $0x10, TKEYP		# TKEYP = last encryption round key
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)	# first enc key becomes last dec key
	movaps %xmm1, 240(KEYP)		# last enc key becomes first dec key
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP	# UKEYP now walks the dec keys backwards
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG			# return 0
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_set_key)

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts the single 16-byte block at src into dst (both may be unaligned)
 * with _aesni_enc1, using the key length stored at 480(ctx).
 */
ENTRY(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_enc)

/*
 * _aesni_enc1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length (compared with 24 to select the round count)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp above
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE		# last round
	ret
ENDPROC(_aesni_enc1)

/*
 * _aesni_enc4:	internal ABI
 * Same round sequence as _aesni_enc1, applied to four blocks at once.
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length (compared with 24 to select the round count)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp above
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)

/*
 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypts the single 16-byte block at src into dst (both may be unaligned)
 * with _aesni_dec1, using the key length stored at 480(ctx) and the round
 * keys starting at 240(ctx).
 */
ENTRY(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP			# aesni_set_key stores the decryption keys at 240(ctx)
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		#output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_dec)

/*
 * _aesni_dec1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp above
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE		# last round
	ret
ENDPROC(_aesni_dec1)

/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY
STATE1 2294 AESDEC KEY STATE2 2295 AESDEC KEY STATE3 2296 AESDEC KEY STATE4 2297 movaps 0x10(TKEYP), KEY 2298 AESDEC KEY STATE1 2299 AESDEC KEY STATE2 2300 AESDEC KEY STATE3 2301 AESDEC KEY STATE4 2302 movaps 0x20(TKEYP), KEY 2303 AESDEC KEY STATE1 2304 AESDEC KEY STATE2 2305 AESDEC KEY STATE3 2306 AESDEC KEY STATE4 2307 movaps 0x30(TKEYP), KEY 2308 AESDEC KEY STATE1 2309 AESDEC KEY STATE2 2310 AESDEC KEY STATE3 2311 AESDEC KEY STATE4 2312 movaps 0x40(TKEYP), KEY 2313 AESDEC KEY STATE1 2314 AESDEC KEY STATE2 2315 AESDEC KEY STATE3 2316 AESDEC KEY STATE4 2317 movaps 0x50(TKEYP), KEY 2318 AESDEC KEY STATE1 2319 AESDEC KEY STATE2 2320 AESDEC KEY STATE3 2321 AESDEC KEY STATE4 2322 movaps 0x60(TKEYP), KEY 2323 AESDEC KEY STATE1 2324 AESDEC KEY STATE2 2325 AESDEC KEY STATE3 2326 AESDEC KEY STATE4 2327 movaps 0x70(TKEYP), KEY 2328 AESDECLAST KEY STATE1 # last round 2329 AESDECLAST KEY STATE2 2330 AESDECLAST KEY STATE3 2331 AESDECLAST KEY STATE4 2332 ret 2333ENDPROC(_aesni_dec4) 2334 2335/* 2336 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2337 * size_t len) 2338 */ 2339ENTRY(aesni_ecb_enc) 2340 FRAME_BEGIN 2341#ifndef __x86_64__ 2342 pushl LEN 2343 pushl KEYP 2344 pushl KLEN 2345 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2346 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2347 movl (FRAME_OFFSET+24)(%esp), INP # src 2348 movl (FRAME_OFFSET+28)(%esp), LEN # len 2349#endif 2350 test LEN, LEN # check length 2351 jz .Lecb_enc_ret 2352 mov 480(KEYP), KLEN 2353 cmp $16, LEN 2354 jb .Lecb_enc_ret 2355 cmp $64, LEN 2356 jb .Lecb_enc_loop1 2357.align 4 2358.Lecb_enc_loop4: 2359 movups (INP), STATE1 2360 movups 0x10(INP), STATE2 2361 movups 0x20(INP), STATE3 2362 movups 0x30(INP), STATE4 2363 call _aesni_enc4 2364 movups STATE1, (OUTP) 2365 movups STATE2, 0x10(OUTP) 2366 movups STATE3, 0x20(OUTP) 2367 movups STATE4, 0x30(OUTP) 2368 sub $64, LEN 2369 add $64, INP 2370 add $64, OUTP 2371 cmp $64, LEN 2372 jge .Lecb_enc_loop4 2373 cmp $16, LEN 2374 jb 
.Lecb_enc_ret 2375.align 4 2376.Lecb_enc_loop1: 2377 movups (INP), STATE1 2378 call _aesni_enc1 2379 movups STATE1, (OUTP) 2380 sub $16, LEN 2381 add $16, INP 2382 add $16, OUTP 2383 cmp $16, LEN 2384 jge .Lecb_enc_loop1 2385.Lecb_enc_ret: 2386#ifndef __x86_64__ 2387 popl KLEN 2388 popl KEYP 2389 popl LEN 2390#endif 2391 FRAME_END 2392 ret 2393ENDPROC(aesni_ecb_enc) 2394 2395/* 2396 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2397 * size_t len); 2398 */ 2399ENTRY(aesni_ecb_dec) 2400 FRAME_BEGIN 2401#ifndef __x86_64__ 2402 pushl LEN 2403 pushl KEYP 2404 pushl KLEN 2405 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2406 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2407 movl (FRAME_OFFSET+24)(%esp), INP # src 2408 movl (FRAME_OFFSET+28)(%esp), LEN # len 2409#endif 2410 test LEN, LEN 2411 jz .Lecb_dec_ret 2412 mov 480(KEYP), KLEN 2413 add $240, KEYP 2414 cmp $16, LEN 2415 jb .Lecb_dec_ret 2416 cmp $64, LEN 2417 jb .Lecb_dec_loop1 2418.align 4 2419.Lecb_dec_loop4: 2420 movups (INP), STATE1 2421 movups 0x10(INP), STATE2 2422 movups 0x20(INP), STATE3 2423 movups 0x30(INP), STATE4 2424 call _aesni_dec4 2425 movups STATE1, (OUTP) 2426 movups STATE2, 0x10(OUTP) 2427 movups STATE3, 0x20(OUTP) 2428 movups STATE4, 0x30(OUTP) 2429 sub $64, LEN 2430 add $64, INP 2431 add $64, OUTP 2432 cmp $64, LEN 2433 jge .Lecb_dec_loop4 2434 cmp $16, LEN 2435 jb .Lecb_dec_ret 2436.align 4 2437.Lecb_dec_loop1: 2438 movups (INP), STATE1 2439 call _aesni_dec1 2440 movups STATE1, (OUTP) 2441 sub $16, LEN 2442 add $16, INP 2443 add $16, OUTP 2444 cmp $16, LEN 2445 jge .Lecb_dec_loop1 2446.Lecb_dec_ret: 2447#ifndef __x86_64__ 2448 popl KLEN 2449 popl KEYP 2450 popl LEN 2451#endif 2452 FRAME_END 2453 ret 2454ENDPROC(aesni_ecb_dec) 2455 2456/* 2457 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2458 * size_t len, u8 *iv) 2459 */ 2460ENTRY(aesni_cbc_enc) 2461 FRAME_BEGIN 2462#ifndef __x86_64__ 2463 pushl IVP 2464 pushl LEN 2465 pushl KEYP 2466 pushl KLEN 
2467 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2468 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2469 movl (FRAME_OFFSET+28)(%esp), INP # src 2470 movl (FRAME_OFFSET+32)(%esp), LEN # len 2471 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2472#endif 2473 cmp $16, LEN 2474 jb .Lcbc_enc_ret 2475 mov 480(KEYP), KLEN 2476 movups (IVP), STATE # load iv as initial state 2477.align 4 2478.Lcbc_enc_loop: 2479 movups (INP), IN # load input 2480 pxor IN, STATE 2481 call _aesni_enc1 2482 movups STATE, (OUTP) # store output 2483 sub $16, LEN 2484 add $16, INP 2485 add $16, OUTP 2486 cmp $16, LEN 2487 jge .Lcbc_enc_loop 2488 movups STATE, (IVP) 2489.Lcbc_enc_ret: 2490#ifndef __x86_64__ 2491 popl KLEN 2492 popl KEYP 2493 popl LEN 2494 popl IVP 2495#endif 2496 FRAME_END 2497 ret 2498ENDPROC(aesni_cbc_enc) 2499 2500/* 2501 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2502 * size_t len, u8 *iv) 2503 */ 2504ENTRY(aesni_cbc_dec) 2505 FRAME_BEGIN 2506#ifndef __x86_64__ 2507 pushl IVP 2508 pushl LEN 2509 pushl KEYP 2510 pushl KLEN 2511 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2512 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2513 movl (FRAME_OFFSET+28)(%esp), INP # src 2514 movl (FRAME_OFFSET+32)(%esp), LEN # len 2515 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2516#endif 2517 cmp $16, LEN 2518 jb .Lcbc_dec_just_ret 2519 mov 480(KEYP), KLEN 2520 add $240, KEYP 2521 movups (IVP), IV 2522 cmp $64, LEN 2523 jb .Lcbc_dec_loop1 2524.align 4 2525.Lcbc_dec_loop4: 2526 movups (INP), IN1 2527 movaps IN1, STATE1 2528 movups 0x10(INP), IN2 2529 movaps IN2, STATE2 2530#ifdef __x86_64__ 2531 movups 0x20(INP), IN3 2532 movaps IN3, STATE3 2533 movups 0x30(INP), IN4 2534 movaps IN4, STATE4 2535#else 2536 movups 0x20(INP), IN1 2537 movaps IN1, STATE3 2538 movups 0x30(INP), IN2 2539 movaps IN2, STATE4 2540#endif 2541 call _aesni_dec4 2542 pxor IV, STATE1 2543#ifdef __x86_64__ 2544 pxor IN1, STATE2 2545 pxor IN2, STATE3 2546 pxor IN3, STATE4 2547 movaps IN4, IV 2548#else 2549 pxor IN1, STATE4 2550 
movaps IN2, IV 2551 movups (INP), IN1 2552 pxor IN1, STATE2 2553 movups 0x10(INP), IN2 2554 pxor IN2, STATE3 2555#endif 2556 movups STATE1, (OUTP) 2557 movups STATE2, 0x10(OUTP) 2558 movups STATE3, 0x20(OUTP) 2559 movups STATE4, 0x30(OUTP) 2560 sub $64, LEN 2561 add $64, INP 2562 add $64, OUTP 2563 cmp $64, LEN 2564 jge .Lcbc_dec_loop4 2565 cmp $16, LEN 2566 jb .Lcbc_dec_ret 2567.align 4 2568.Lcbc_dec_loop1: 2569 movups (INP), IN 2570 movaps IN, STATE 2571 call _aesni_dec1 2572 pxor IV, STATE 2573 movups STATE, (OUTP) 2574 movaps IN, IV 2575 sub $16, LEN 2576 add $16, INP 2577 add $16, OUTP 2578 cmp $16, LEN 2579 jge .Lcbc_dec_loop1 2580.Lcbc_dec_ret: 2581 movups IV, (IVP) 2582.Lcbc_dec_just_ret: 2583#ifndef __x86_64__ 2584 popl KLEN 2585 popl KEYP 2586 popl LEN 2587 popl IVP 2588#endif 2589 FRAME_END 2590 ret 2591ENDPROC(aesni_cbc_dec) 2592 2593#ifdef __x86_64__ 2594.pushsection .rodata 2595.align 16 2596.Lbswap_mask: 2597 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2598.popsection 2599 2600/* 2601 * _aesni_inc_init: internal ABI 2602 * setup registers used by _aesni_inc 2603 * input: 2604 * IV 2605 * output: 2606 * CTR: == IV, in little endian 2607 * TCTR_LOW: == lower qword of CTR 2608 * INC: == 1, in little endian 2609 * BSWAP_MASK == endian swapping mask 2610 */ 2611.align 4 2612_aesni_inc_init: 2613 movaps .Lbswap_mask, BSWAP_MASK 2614 movaps IV, CTR 2615 PSHUFB_XMM BSWAP_MASK CTR 2616 mov $1, TCTR_LOW 2617 MOVQ_R64_XMM TCTR_LOW INC 2618 MOVQ_R64_XMM CTR TCTR_LOW 2619 ret 2620ENDPROC(_aesni_inc_init) 2621 2622/* 2623 * _aesni_inc: internal ABI 2624 * Increase IV by 1, IV is in big endian 2625 * input: 2626 * IV 2627 * CTR: == IV, in little endian 2628 * TCTR_LOW: == lower qword of CTR 2629 * INC: == 1, in little endian 2630 * BSWAP_MASK == endian swapping mask 2631 * output: 2632 * IV: Increase by 1 2633 * changed: 2634 * CTR: == output IV, in little endian 2635 * TCTR_LOW: == lower qword of CTR 2636 */ 2637.align 4 2638_aesni_inc: 2639 paddq 
INC, CTR 2640 add $1, TCTR_LOW 2641 jnc .Linc_low 2642 pslldq $8, INC 2643 paddq INC, CTR 2644 psrldq $8, INC 2645.Linc_low: 2646 movaps CTR, IV 2647 PSHUFB_XMM BSWAP_MASK IV 2648 ret 2649ENDPROC(_aesni_inc) 2650 2651/* 2652 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2653 * size_t len, u8 *iv) 2654 */ 2655ENTRY(aesni_ctr_enc) 2656 FRAME_BEGIN 2657 cmp $16, LEN 2658 jb .Lctr_enc_just_ret 2659 mov 480(KEYP), KLEN 2660 movups (IVP), IV 2661 call _aesni_inc_init 2662 cmp $64, LEN 2663 jb .Lctr_enc_loop1 2664.align 4 2665.Lctr_enc_loop4: 2666 movaps IV, STATE1 2667 call _aesni_inc 2668 movups (INP), IN1 2669 movaps IV, STATE2 2670 call _aesni_inc 2671 movups 0x10(INP), IN2 2672 movaps IV, STATE3 2673 call _aesni_inc 2674 movups 0x20(INP), IN3 2675 movaps IV, STATE4 2676 call _aesni_inc 2677 movups 0x30(INP), IN4 2678 call _aesni_enc4 2679 pxor IN1, STATE1 2680 movups STATE1, (OUTP) 2681 pxor IN2, STATE2 2682 movups STATE2, 0x10(OUTP) 2683 pxor IN3, STATE3 2684 movups STATE3, 0x20(OUTP) 2685 pxor IN4, STATE4 2686 movups STATE4, 0x30(OUTP) 2687 sub $64, LEN 2688 add $64, INP 2689 add $64, OUTP 2690 cmp $64, LEN 2691 jge .Lctr_enc_loop4 2692 cmp $16, LEN 2693 jb .Lctr_enc_ret 2694.align 4 2695.Lctr_enc_loop1: 2696 movaps IV, STATE 2697 call _aesni_inc 2698 movups (INP), IN 2699 call _aesni_enc1 2700 pxor IN, STATE 2701 movups STATE, (OUTP) 2702 sub $16, LEN 2703 add $16, INP 2704 add $16, OUTP 2705 cmp $16, LEN 2706 jge .Lctr_enc_loop1 2707.Lctr_enc_ret: 2708 movups IV, (IVP) 2709.Lctr_enc_just_ret: 2710 FRAME_END 2711 ret 2712ENDPROC(aesni_ctr_enc) 2713 2714/* 2715 * _aesni_gf128mul_x_ble: internal ABI 2716 * Multiply in GF(2^128) for XTS IVs 2717 * input: 2718 * IV: current IV 2719 * GF128MUL_MASK == mask with 0x87 and 0x01 2720 * output: 2721 * IV: next IV 2722 * changed: 2723 * CTR: == temporary value 2724 */ 2725#define _aesni_gf128mul_x_ble() \ 2726 pshufd $0x13, IV, CTR; \ 2727 paddq IV, IV; \ 2728 psrad $31, CTR; \ 2729 pand 
GF128MUL_MASK, CTR; \ 2730 pxor CTR, IV; 2731 2732/* 2733 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2734 * bool enc, u8 *iv) 2735 */ 2736ENTRY(aesni_xts_crypt8) 2737 FRAME_BEGIN 2738 cmpb $0, %cl 2739 movl $0, %ecx 2740 movl $240, %r10d 2741 leaq _aesni_enc4, %r11 2742 leaq _aesni_dec4, %rax 2743 cmovel %r10d, %ecx 2744 cmoveq %rax, %r11 2745 2746 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK 2747 movups (IVP), IV 2748 2749 mov 480(KEYP), KLEN 2750 addq %rcx, KEYP 2751 2752 movdqa IV, STATE1 2753 movdqu 0x00(INP), INC 2754 pxor INC, STATE1 2755 movdqu IV, 0x00(OUTP) 2756 2757 _aesni_gf128mul_x_ble() 2758 movdqa IV, STATE2 2759 movdqu 0x10(INP), INC 2760 pxor INC, STATE2 2761 movdqu IV, 0x10(OUTP) 2762 2763 _aesni_gf128mul_x_ble() 2764 movdqa IV, STATE3 2765 movdqu 0x20(INP), INC 2766 pxor INC, STATE3 2767 movdqu IV, 0x20(OUTP) 2768 2769 _aesni_gf128mul_x_ble() 2770 movdqa IV, STATE4 2771 movdqu 0x30(INP), INC 2772 pxor INC, STATE4 2773 movdqu IV, 0x30(OUTP) 2774 2775 CALL_NOSPEC %r11 2776 2777 movdqu 0x00(OUTP), INC 2778 pxor INC, STATE1 2779 movdqu STATE1, 0x00(OUTP) 2780 2781 _aesni_gf128mul_x_ble() 2782 movdqa IV, STATE1 2783 movdqu 0x40(INP), INC 2784 pxor INC, STATE1 2785 movdqu IV, 0x40(OUTP) 2786 2787 movdqu 0x10(OUTP), INC 2788 pxor INC, STATE2 2789 movdqu STATE2, 0x10(OUTP) 2790 2791 _aesni_gf128mul_x_ble() 2792 movdqa IV, STATE2 2793 movdqu 0x50(INP), INC 2794 pxor INC, STATE2 2795 movdqu IV, 0x50(OUTP) 2796 2797 movdqu 0x20(OUTP), INC 2798 pxor INC, STATE3 2799 movdqu STATE3, 0x20(OUTP) 2800 2801 _aesni_gf128mul_x_ble() 2802 movdqa IV, STATE3 2803 movdqu 0x60(INP), INC 2804 pxor INC, STATE3 2805 movdqu IV, 0x60(OUTP) 2806 2807 movdqu 0x30(OUTP), INC 2808 pxor INC, STATE4 2809 movdqu STATE4, 0x30(OUTP) 2810 2811 _aesni_gf128mul_x_ble() 2812 movdqa IV, STATE4 2813 movdqu 0x70(INP), INC 2814 pxor INC, STATE4 2815 movdqu IV, 0x70(OUTP) 2816 2817 _aesni_gf128mul_x_ble() 2818 movups IV, (IVP) 2819 2820 CALL_NOSPEC %r11 2821 2822 
movdqu 0x40(OUTP), INC 2823 pxor INC, STATE1 2824 movdqu STATE1, 0x40(OUTP) 2825 2826 movdqu 0x50(OUTP), INC 2827 pxor INC, STATE2 2828 movdqu STATE2, 0x50(OUTP) 2829 2830 movdqu 0x60(OUTP), INC 2831 pxor INC, STATE3 2832 movdqu STATE3, 0x60(OUTP) 2833 2834 movdqu 0x70(OUTP), INC 2835 pxor INC, STATE4 2836 movdqu STATE4, 0x70(OUTP) 2837 2838 FRAME_END 2839 ret 2840ENDPROC(aesni_xts_crypt8) 2841 2842#endif 2843