/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values: for FP
 * use movaps (move aligned packed single), for integer use movdqa (move
 * double quad aligned).  It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released.
 * However, movaps is a byte shorter, so that is the one we'll use for now
 * (same for unaligned).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
/* POLY and TWOONE are used by PRECOMPUTE when reducing HashKey<<1 */
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

/*
 * SHUF_MASK is the pshufb control used throughout for the "16 byte swap":
 * stored little-endian, its bytes are 0x0f,0x0e,...,0x00, i.e. it reverses
 * the byte order of a 128-bit register.
 */
.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
/* ONE is added (paddd) to the byte-swapped counter block to step it */
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
/*
 * NOTE(review): the F_MIN_MASK literal below has 31 hex digits, so its top
 * nibble is zero.  Confirm whether that is intended before relying on it.
 */
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
/*
 * The GCM macros index across these three 16-byte values with plain pointer
 * arithmetic (SHIFT_MASK+16-n, ALL_F+16-n, ALL_F-SHIFT_MASK(reg)), so they
 * live in one non-mergeable section and must stay adjacent.
 */
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text


/*
 * Bytes pushed by FUNC_SAVE (three 8-byte registers); used below to locate
 * the stack-passed arguments relative to %rsp.
 */
#define	STACK_OFFSET    8*3

/* Byte offsets of the gcm_context_data fields, addressed through %arg2 */
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)

/*
 * Function arguments.  arg1-arg6 are register names without the % sign
 * (written as %arg1 etc. at the point of use); arg7-arg11 are complete
 * memory operands that are valid once FUNC_SAVE has run.  The extra 8 in
 * those operands presumably skips the return address - confirm against the
 * callers.
 */
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
/*
 * Key length field of the AES key context pointed to by %arg1.  The round
 * count arithmetic in the macros (shr $2: 128->4, 192->6, 256->8) implies
 * the value is the key length in bytes.
 */
#define keysize 2*15*16(%arg1)
#endif


/* XMM register aliases.  BSWAP_MASK and GF128MUL_MASK share %xmm10. */
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

/* General-purpose register aliases for the 64-bit and 32-bit builds */
#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

/*
 * FUNC_SAVE: push %r12-%r14, which the GCM macros use as scratch.  The three
 * pushes are what STACK_OFFSET accounts for.  No XMM state is saved.
 */
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


/* FUNC_RESTORE: pop %r14-%r12, undoing FUNC_SAVE in reverse order. */
.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm

# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data.  Only needs to be called
# once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	/*
	 * SUBKEY is an operand that yields a pointer to the 16-byte hash
	 * subkey.  TMP1-TMP7 are scratch XMM registers.  Results are written
	 * to the HashKey* slots of the context at %arg2:
	 *   HashKey..HashKey_4      = HashKey^n << 1 mod poly  (n = 1..4)
	 *   HashKey_k..HashKey_4_k  = high qword XOR low qword of the above
	 */
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	PSHUFB_XMM \TMP2, \TMP3		# byte-reflect the subkey

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3		# shift each qword left by one bit
	psrlq	$63, \TMP2		# TMP2 = bit shifted out of each qword
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2		# carry of the low qword -> high qword
	psrldq	$8, \TMP1		# TMP1 = bit shifted out of bit 127
	por	\TMP2, \TMP3		# TMP3 = 128-bit HashKey << 1

	# reduce HashKey<<1

	/*
	 * Build a mask that is all ones in dwords 0 and 3 only when the bit
	 * shifted out of the top was set (dwords 1 and 2 always compare
	 * equal, but POLY is zero there), so TMP2 ends up as POLY or zero.
	 */
	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1	# swap the two qwords
	pxor	   \TMP3, \TMP1		# TMP1 = high XOR low (both halves)
	movdqu	   \TMP1, HashKey_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	   \TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_2_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	   \TMP5, HashKey_3(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_3_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	   \TMP5, HashKey_4(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_4_k(%arg2)
.endm

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13, xmm0-xmm7, %xmm13 and %xmm14
/*
 * Iv     - operand yielding a pointer to the 16-byte initial counter block
 * SUBKEY - operand yielding a pointer to the hash subkey (see PRECOMPUTE)
 * AAD    - operand yielding a pointer to the additional authenticated data
 * AADLEN - operand yielding the AAD length in bytes
 * The context being initialised is addressed through %arg2.
 */
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov \AADLEN, %r11
	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
	xor %r11d, %r11d
	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
	mov \Iv, %rax
	movdqu (%rax), %xmm0
	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv

	movdqa  SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm0
	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = byte-swapped iv

	/*
	 * Exactly eight arguments and no trailing comma: GNU as tolerates a
	 * trailing comma here, but stricter macro parsers count it as a
	 * ninth (empty) positional argument and reject the invocation.
	 */
	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
/*
 * operation is the literal enc or dec.
 * Register usage, as established by the code below:
 *   %arg1 = AES key schedule, %arg2 = gcm context,
 *   %arg3 = output buffer, %arg4 = input buffer, %arg5 = length in bytes
 *   %r11  = running byte offset into input/output
 *   %r13  = bytes still to process in whole 16-byte blocks, later len mod 16
 *   %xmm8 = running GHASH value, %xmm0 = counter block
 */
.macro GCM_ENC_DEC operation
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13
	add %arg5, InLen(%arg2)

	xor %r11d, %r11d # initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub %r11, %arg5		# sub partial block data used
	mov %arg5, %r13		# save the number of bytes

	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
	# Encrypt/Decrypt first few blocks

	/*
	 * %r12 = 16 * (number of whole blocks mod 4).  That many blocks are
	 * handled up front so the main loop always sees a multiple of four.
	 * The last two numeric arguments select the XMM registers used for
	 * those blocks (first register number, then the digits of each).
	 */
	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	/*
	 * When %r13 is non-zero here it is a multiple of 64 and
	 * INITIAL_BLOCKS_ENC_DEC has already produced the first group of four
	 * blocks in %xmm1-%xmm4, whose GHASH is still pending.
	 */
	cmp	$0, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	/*
	 * The macro name suffix is the lower-case operation value; the
	 * trailing enc argument is passed for both variants.
	 */
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	/* GHASH_LAST_4 is defined elsewhere in this file */
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu %xmm8, AadHash(%arg2)
	movdqu %xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov %r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
	movdqu %xmm0, CurCount(%arg2)
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
	movdqu %xmm0, PBlockEncKey(%arg2)	# kept for a later PARTIAL_BLOCK

	cmp	$16, %arg5
	jge _large_enough_update_\@

	/* fewer than 16 bytes in total: read them without over-reading */
	lea (%arg4,%r11,1), %r10
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp _data_read_\@

_large_enough_update_\@:
	/*
	 * At least 16 bytes are available: load the 16 bytes that end at the
	 * end of the input, then shift the wanted tail down with pshufb.
	 */
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	PSHUFB_XMM  %xmm2, %xmm1

_data_read_\@:
	/* %r12 -> mask with the low r13 bytes set and the rest clear */
	lea ALL_F+16(%rip), %r12
	sub %r13, %r12

.ifc \operation, dec
	movdqa  %xmm1, %xmm2		# keep the ciphertext for GHASH
.endif
	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand    %xmm1, %xmm2
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor %xmm2, %xmm8
.else
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8
.endif

	/*
	 * The partial block is only XORed into the hash here; the multiply
	 * by HashKey happens later, in PARTIAL_BLOCK or GCM_COMPLETE.
	 */
	movdqu %xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	PSHUFB_XMM %xmm10, %xmm0
.endif

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_\@
	mov %rax, (%arg3 , %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_\@:
	mov %al,  (%arg3, %r11, 1)
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm

# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
/*
 * AUTHTAG    - operand yielding the destination pointer for the tag
 * AUTHTAGLEN - operand yielding the tag length in bytes
 */
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13

	mov PBlockLen(%arg2), %r12

	cmp $0, %r12
	je _partial_done\@

	/* a partial block is still pending: finish its GHASH multiply */
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
	shl	$3, %r12		  # convert into number of bits
	movd	%r12d, %xmm15		  # len(A) in %xmm15
	mov InLen(%arg2), %r12
	shl     $3, %r12                  # len(C) in bits (*128)
	MOVQ_R64_XMM    %r12, %xmm1

	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_\@:
	/*
	 * Store the tag: 16 bytes at once, otherwise an 8-byte piece (when
	 * len >= 8), then a 4-byte piece, then up to three trailing bytes.
	 * NOTE(review): the 4-byte step always stores four bytes when it is
	 * reached, so callers presumably only pass lengths for which that is
	 * in bounds - confirm.
	 */
	mov	\AUTHTAG, %r10                     # %r10 = authTag
	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm

#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
513* 514*/ 515.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 516 movdqa \GH, \TMP1 517 pshufd $78, \GH, \TMP2 518 pshufd $78, \HK, \TMP3 519 pxor \GH, \TMP2 # TMP2 = a1+a0 520 pxor \HK, \TMP3 # TMP3 = b1+b0 521 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 522 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 523 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) 524 pxor \GH, \TMP2 525 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) 526 movdqa \TMP2, \TMP3 527 pslldq $8, \TMP3 # left shift TMP3 2 DWs 528 psrldq $8, \TMP2 # right shift TMP2 2 DWs 529 pxor \TMP3, \GH 530 pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK 531 532 # first phase of the reduction 533 534 movdqa \GH, \TMP2 535 movdqa \GH, \TMP3 536 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 537 # in in order to perform 538 # independent shifts 539 pslld $31, \TMP2 # packed right shift <<31 540 pslld $30, \TMP3 # packed right shift <<30 541 pslld $25, \TMP4 # packed right shift <<25 542 pxor \TMP3, \TMP2 # xor the shifted versions 543 pxor \TMP4, \TMP2 544 movdqa \TMP2, \TMP5 545 psrldq $4, \TMP5 # right shift TMP5 1 DW 546 pslldq $12, \TMP2 # left shift TMP2 3 DWs 547 pxor \TMP2, \GH 548 549 # second phase of the reduction 550 551 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 552 # in in order to perform 553 # independent shifts 554 movdqa \GH,\TMP3 555 movdqa \GH,\TMP4 556 psrld $1,\TMP2 # packed left shift >>1 557 psrld $2,\TMP3 # packed left shift >>2 558 psrld $7,\TMP4 # packed left shift >>7 559 pxor \TMP3,\TMP2 # xor the shifted versions 560 pxor \TMP4,\TMP2 561 pxor \TMP5, \TMP2 562 pxor \TMP2, \GH 563 pxor \TMP1, \GH # result is in TMP1 564.endm 565 566# Reads DLEN bytes starting at DPTR and stores in XMMDst 567# where 0 < DLEN < 16 568# Clobbers %rax, DLEN and XMM1 569.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst 570 cmp $8, \DLEN 571 jl _read_lt8_\@ 572 mov (\DPTR), %rax 573 MOVQ_R64_XMM %rax, \XMMDst 574 sub $8, \DLEN 575 jz _done_read_partial_block_\@ 576 xor %eax, %eax 
577_read_next_byte_\@: 578 shl $8, %rax 579 mov 7(\DPTR, \DLEN, 1), %al 580 dec \DLEN 581 jnz _read_next_byte_\@ 582 MOVQ_R64_XMM %rax, \XMM1 583 pslldq $8, \XMM1 584 por \XMM1, \XMMDst 585 jmp _done_read_partial_block_\@ 586_read_lt8_\@: 587 xor %eax, %eax 588_read_next_byte_lt8_\@: 589 shl $8, %rax 590 mov -1(\DPTR, \DLEN, 1), %al 591 dec \DLEN 592 jnz _read_next_byte_lt8_\@ 593 MOVQ_R64_XMM %rax, \XMMDst 594_done_read_partial_block_\@: 595.endm 596 597# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. 598# clobbers r10-11, xmm14 599.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \ 600 TMP6 TMP7 601 MOVADQ SHUF_MASK(%rip), %xmm14 602 mov \AAD, %r10 # %r10 = AAD 603 mov \AADLEN, %r11 # %r11 = aadLen 604 pxor \TMP7, \TMP7 605 pxor \TMP6, \TMP6 606 607 cmp $16, %r11 608 jl _get_AAD_rest\@ 609_get_AAD_blocks\@: 610 movdqu (%r10), \TMP7 611 PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data 612 pxor \TMP7, \TMP6 613 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 614 add $16, %r10 615 sub $16, %r11 616 cmp $16, %r11 617 jge _get_AAD_blocks\@ 618 619 movdqu \TMP6, \TMP7 620 621 /* read the last <16B of AAD */ 622_get_AAD_rest\@: 623 cmp $0, %r11 624 je _get_AAD_done\@ 625 626 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 627 PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data 628 pxor \TMP6, \TMP7 629 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 630 movdqu \TMP7, \TMP6 631 632_get_AAD_done\@: 633 movdqu \TMP6, AadHash(%arg2) 634.endm 635 636# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks 637# between update calls. 
638# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK 639# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context 640# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 641.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ 642 AAD_HASH operation 643 mov PBlockLen(%arg2), %r13 644 cmp $0, %r13 645 je _partial_block_done_\@ # Leave Macro if no partial blocks 646 # Read in input data without over reading 647 cmp $16, \PLAIN_CYPH_LEN 648 jl _fewer_than_16_bytes_\@ 649 movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm 650 jmp _data_read_\@ 651 652_fewer_than_16_bytes_\@: 653 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 654 mov \PLAIN_CYPH_LEN, %r12 655 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 656 657 mov PBlockLen(%arg2), %r13 658 659_data_read_\@: # Finished reading in data 660 661 movdqu PBlockEncKey(%arg2), %xmm9 662 movdqu HashKey(%arg2), %xmm13 663 664 lea SHIFT_MASK(%rip), %r12 665 666 # adjust the shuffle mask pointer to be able to shift r13 bytes 667 # r16-r13 is the number of bytes in plaintext mod 16) 668 add %r13, %r12 669 movdqu (%r12), %xmm2 # get the appropriate shuffle mask 670 PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes 671 672.ifc \operation, dec 673 movdqa %xmm1, %xmm3 674 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) 675 676 mov \PLAIN_CYPH_LEN, %r10 677 add %r13, %r10 678 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 679 sub $16, %r10 680 # Determine if if partial block is not being filled and 681 # shift mask accordingly 682 jge _no_extra_mask_1_\@ 683 sub %r10, %r12 684_no_extra_mask_1_\@: 685 686 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 687 # get the appropriate mask to mask out bottom r13 bytes of xmm9 688 pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9 689 690 pand %xmm1, %xmm3 691 movdqa SHUF_MASK(%rip), %xmm10 692 PSHUFB_XMM %xmm10, %xmm3 693 PSHUFB_XMM %xmm2, %xmm3 694 pxor %xmm3, \AAD_HASH 695 696 cmp $0, %r10 697 jl 
_partial_incomplete_1_\@ 698 699 # GHASH computation for the last <16 Byte block 700 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 701 xor %eax, %eax 702 703 mov %rax, PBlockLen(%arg2) 704 jmp _dec_done_\@ 705_partial_incomplete_1_\@: 706 add \PLAIN_CYPH_LEN, PBlockLen(%arg2) 707_dec_done_\@: 708 movdqu \AAD_HASH, AadHash(%arg2) 709.else 710 pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) 711 712 mov \PLAIN_CYPH_LEN, %r10 713 add %r13, %r10 714 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 715 sub $16, %r10 716 # Determine if if partial block is not being filled and 717 # shift mask accordingly 718 jge _no_extra_mask_2_\@ 719 sub %r10, %r12 720_no_extra_mask_2_\@: 721 722 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 723 # get the appropriate mask to mask out bottom r13 bytes of xmm9 724 pand %xmm1, %xmm9 725 726 movdqa SHUF_MASK(%rip), %xmm1 727 PSHUFB_XMM %xmm1, %xmm9 728 PSHUFB_XMM %xmm2, %xmm9 729 pxor %xmm9, \AAD_HASH 730 731 cmp $0, %r10 732 jl _partial_incomplete_2_\@ 733 734 # GHASH computation for the last <16 Byte block 735 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 736 xor %eax, %eax 737 738 mov %rax, PBlockLen(%arg2) 739 jmp _encode_done_\@ 740_partial_incomplete_2_\@: 741 add \PLAIN_CYPH_LEN, PBlockLen(%arg2) 742_encode_done_\@: 743 movdqu \AAD_HASH, AadHash(%arg2) 744 745 movdqa SHUF_MASK(%rip), %xmm10 746 # shuffle xmm9 back to output as ciphertext 747 PSHUFB_XMM %xmm10, %xmm9 748 PSHUFB_XMM %xmm2, %xmm9 749.endif 750 # output encrypted Bytes 751 cmp $0, %r10 752 jl _partial_fill_\@ 753 mov %r13, %r12 754 mov $16, %r13 755 # Set r13 to be the number of bytes to write out 756 sub %r12, %r13 757 jmp _count_set_\@ 758_partial_fill_\@: 759 mov \PLAIN_CYPH_LEN, %r13 760_count_set_\@: 761 movdqa %xmm9, %xmm0 762 MOVQ_R64_XMM %xmm0, %rax 763 cmp $8, %r13 764 jle _less_than_8_bytes_left_\@ 765 766 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 767 add $8, \DATA_OFFSET 768 psrldq $8, %xmm0 769 MOVQ_R64_XMM 
%xmm0, %rax 770 sub $8, %r13 771_less_than_8_bytes_left_\@: 772 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 773 add $1, \DATA_OFFSET 774 shr $8, %rax 775 sub $1, %r13 776 jne _less_than_8_bytes_left_\@ 777_partial_block_done_\@: 778.endm # PARTIAL_BLOCK 779 780/* 781* if a = number of total plaintext bytes 782* b = floor(a/16) 783* num_initial_blocks = b mod 4 784* encrypt the initial num_initial_blocks blocks and apply ghash on 785* the ciphertext 786* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 787* are clobbered 788* arg1, %arg2, %arg3 are used as a pointer only, not modified 789*/ 790 791 792.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 793 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 794 MOVADQ SHUF_MASK(%rip), %xmm14 795 796 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 797 798 # start AES for num_initial_blocks blocks 799 800 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 801 802.if (\i == 5) || (\i == 6) || (\i == 7) 803 804 MOVADQ ONE(%RIP),\TMP1 805 MOVADQ 0(%arg1),\TMP2 806.irpc index, \i_seq 807 paddd \TMP1, \XMM0 # INCR Y0 808.ifc \operation, dec 809 movdqa \XMM0, %xmm\index 810.else 811 MOVADQ \XMM0, %xmm\index 812.endif 813 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap 814 pxor \TMP2, %xmm\index 815.endr 816 lea 0x10(%arg1),%r10 817 mov keysize,%eax 818 shr $2,%eax # 128->4, 192->6, 256->8 819 add $5,%eax # 128->9, 192->11, 256->13 820 821aes_loop_initial_\@: 822 MOVADQ (%r10),\TMP1 823.irpc index, \i_seq 824 AESENC \TMP1, %xmm\index 825.endr 826 add $16,%r10 827 sub $1,%eax 828 jnz aes_loop_initial_\@ 829 830 MOVADQ (%r10), \TMP1 831.irpc index, \i_seq 832 AESENCLAST \TMP1, %xmm\index # Last Round 833.endr 834.irpc index, \i_seq 835 movdqu (%arg4 , %r11, 1), \TMP1 836 pxor \TMP1, %xmm\index 837 movdqu %xmm\index, (%arg3 , %r11, 1) 838 # write back plaintext/ciphertext for num_initial_blocks 839 add $16, %r11 840 841.ifc \operation, dec 842 movdqa \TMP1, %xmm\index 843.endif 844 PSHUFB_XMM %xmm14, 
%xmm\index 845 846 # prepare plaintext/ciphertext for GHASH computation 847.endr 848.endif 849 850 # apply GHASH on num_initial_blocks blocks 851 852.if \i == 5 853 pxor %xmm5, %xmm6 854 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 855 pxor %xmm6, %xmm7 856 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 857 pxor %xmm7, %xmm8 858 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 859.elseif \i == 6 860 pxor %xmm6, %xmm7 861 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 862 pxor %xmm7, %xmm8 863 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 864.elseif \i == 7 865 pxor %xmm7, %xmm8 866 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 867.endif 868 cmp $64, %r13 869 jl _initial_blocks_done\@ 870 # no need for precomputed values 871/* 872* 873* Precomputations for HashKey parallel with encryption of first 4 blocks. 874* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 875*/ 876 MOVADQ ONE(%RIP),\TMP1 877 paddd \TMP1, \XMM0 # INCR Y0 878 MOVADQ \XMM0, \XMM1 879 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 880 881 paddd \TMP1, \XMM0 # INCR Y0 882 MOVADQ \XMM0, \XMM2 883 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 884 885 paddd \TMP1, \XMM0 # INCR Y0 886 MOVADQ \XMM0, \XMM3 887 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 888 889 paddd \TMP1, \XMM0 # INCR Y0 890 MOVADQ \XMM0, \XMM4 891 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 892 893 MOVADQ 0(%arg1),\TMP1 894 pxor \TMP1, \XMM1 895 pxor \TMP1, \XMM2 896 pxor \TMP1, \XMM3 897 pxor \TMP1, \XMM4 898.irpc index, 1234 # do 4 rounds 899 movaps 0x10*\index(%arg1), \TMP1 900 AESENC \TMP1, \XMM1 901 AESENC \TMP1, \XMM2 902 AESENC \TMP1, \XMM3 903 AESENC \TMP1, \XMM4 904.endr 905.irpc index, 56789 # do next 5 rounds 906 movaps 0x10*\index(%arg1), \TMP1 907 AESENC \TMP1, \XMM1 908 AESENC \TMP1, \XMM2 909 AESENC \TMP1, \XMM3 910 AESENC \TMP1, \XMM4 911.endr 912 lea 0xa0(%arg1),%r10 913 mov keysize,%eax 914 shr $2,%eax # 
128->4, 192->6, 256->8 915 sub $4,%eax # 128->0, 192->2, 256->4 916 jz aes_loop_pre_done\@ 917 918aes_loop_pre_\@: 919 MOVADQ (%r10),\TMP2 920.irpc index, 1234 921 AESENC \TMP2, %xmm\index 922.endr 923 add $16,%r10 924 sub $1,%eax 925 jnz aes_loop_pre_\@ 926 927aes_loop_pre_done\@: 928 MOVADQ (%r10), \TMP2 929 AESENCLAST \TMP2, \XMM1 930 AESENCLAST \TMP2, \XMM2 931 AESENCLAST \TMP2, \XMM3 932 AESENCLAST \TMP2, \XMM4 933 movdqu 16*0(%arg4 , %r11 , 1), \TMP1 934 pxor \TMP1, \XMM1 935.ifc \operation, dec 936 movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 937 movdqa \TMP1, \XMM1 938.endif 939 movdqu 16*1(%arg4 , %r11 , 1), \TMP1 940 pxor \TMP1, \XMM2 941.ifc \operation, dec 942 movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 943 movdqa \TMP1, \XMM2 944.endif 945 movdqu 16*2(%arg4 , %r11 , 1), \TMP1 946 pxor \TMP1, \XMM3 947.ifc \operation, dec 948 movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 949 movdqa \TMP1, \XMM3 950.endif 951 movdqu 16*3(%arg4 , %r11 , 1), \TMP1 952 pxor \TMP1, \XMM4 953.ifc \operation, dec 954 movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 955 movdqa \TMP1, \XMM4 956.else 957 movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 958 movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 959 movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 960 movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 961.endif 962 963 add $64, %r11 964 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap 965 pxor \XMMDst, \XMM1 966# combine GHASHed value with the corresponding ciphertext 967 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap 968 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap 969 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap 970 971_initial_blocks_done\@: 972 973.endm 974 975/* 976* encrypt 4 blocks at a time 977* ghash the 4 previously encrypted ciphertext blocks 978* arg1, %arg3, %arg4 are used as pointers only, not modified 979* %r11 is the data offset value 980*/ 981.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ 982TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 983 984 movdqa \XMM1, \XMM5 985 movdqa \XMM2, \XMM6 986 
movdqa \XMM3, \XMM7 987 movdqa \XMM4, \XMM8 988 989 movdqa SHUF_MASK(%rip), %xmm15 990 # multiply TMP5 * HashKey using karatsuba 991 992 movdqa \XMM5, \TMP4 993 pshufd $78, \XMM5, \TMP6 994 pxor \XMM5, \TMP6 995 paddd ONE(%rip), \XMM0 # INCR CNT 996 movdqu HashKey_4(%arg2), \TMP5 997 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 998 movdqa \XMM0, \XMM1 999 paddd ONE(%rip), \XMM0 # INCR CNT 1000 movdqa \XMM0, \XMM2 1001 paddd ONE(%rip), \XMM0 # INCR CNT 1002 movdqa \XMM0, \XMM3 1003 paddd ONE(%rip), \XMM0 # INCR CNT 1004 movdqa \XMM0, \XMM4 1005 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 1006 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 1010 1011 pxor (%arg1), \XMM1 1012 pxor (%arg1), \XMM2 1013 pxor (%arg1), \XMM3 1014 pxor (%arg1), \XMM4 1015 movdqu HashKey_4_k(%arg2), \TMP5 1016 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 1017 movaps 0x10(%arg1), \TMP1 1018 AESENC \TMP1, \XMM1 # Round 1 1019 AESENC \TMP1, \XMM2 1020 AESENC \TMP1, \XMM3 1021 AESENC \TMP1, \XMM4 1022 movaps 0x20(%arg1), \TMP1 1023 AESENC \TMP1, \XMM1 # Round 2 1024 AESENC \TMP1, \XMM2 1025 AESENC \TMP1, \XMM3 1026 AESENC \TMP1, \XMM4 1027 movdqa \XMM6, \TMP1 1028 pshufd $78, \XMM6, \TMP2 1029 pxor \XMM6, \TMP2 1030 movdqu HashKey_3(%arg2), \TMP5 1031 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 1032 movaps 0x30(%arg1), \TMP3 1033 AESENC \TMP3, \XMM1 # Round 3 1034 AESENC \TMP3, \XMM2 1035 AESENC \TMP3, \XMM3 1036 AESENC \TMP3, \XMM4 1037 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 1038 movaps 0x40(%arg1), \TMP3 1039 AESENC \TMP3, \XMM1 # Round 4 1040 AESENC \TMP3, \XMM2 1041 AESENC \TMP3, \XMM3 1042 AESENC \TMP3, \XMM4 1043 movdqu HashKey_3_k(%arg2), \TMP5 1044 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1045 movaps 0x50(%arg1), \TMP3 1046 AESENC \TMP3, \XMM1 # Round 5 1047 AESENC \TMP3, \XMM2 1048 AESENC \TMP3, 
\XMM3 1049 AESENC \TMP3, \XMM4 1050 pxor \TMP1, \TMP4 1051# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1052 pxor \XMM6, \XMM5 1053 pxor \TMP2, \TMP6 1054 movdqa \XMM7, \TMP1 1055 pshufd $78, \XMM7, \TMP2 1056 pxor \XMM7, \TMP2 1057 movdqu HashKey_2(%arg2), \TMP5 1058 1059 # Multiply TMP5 * HashKey using karatsuba 1060 1061 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1062 movaps 0x60(%arg1), \TMP3 1063 AESENC \TMP3, \XMM1 # Round 6 1064 AESENC \TMP3, \XMM2 1065 AESENC \TMP3, \XMM3 1066 AESENC \TMP3, \XMM4 1067 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 1068 movaps 0x70(%arg1), \TMP3 1069 AESENC \TMP3, \XMM1 # Round 7 1070 AESENC \TMP3, \XMM2 1071 AESENC \TMP3, \XMM3 1072 AESENC \TMP3, \XMM4 1073 movdqu HashKey_2_k(%arg2), \TMP5 1074 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1075 movaps 0x80(%arg1), \TMP3 1076 AESENC \TMP3, \XMM1 # Round 8 1077 AESENC \TMP3, \XMM2 1078 AESENC \TMP3, \XMM3 1079 AESENC \TMP3, \XMM4 1080 pxor \TMP1, \TMP4 1081# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1082 pxor \XMM7, \XMM5 1083 pxor \TMP2, \TMP6 1084 1085 # Multiply XMM8 * HashKey 1086 # XMM8 and TMP5 hold the values for the two operands 1087 1088 movdqa \XMM8, \TMP1 1089 pshufd $78, \XMM8, \TMP2 1090 pxor \XMM8, \TMP2 1091 movdqu HashKey(%arg2), \TMP5 1092 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1093 movaps 0x90(%arg1), \TMP3 1094 AESENC \TMP3, \XMM1 # Round 9 1095 AESENC \TMP3, \XMM2 1096 AESENC \TMP3, \XMM3 1097 AESENC \TMP3, \XMM4 1098 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 1099 lea 0xa0(%arg1),%r10 1100 mov keysize,%eax 1101 shr $2,%eax # 128->4, 192->6, 256->8 1102 sub $4,%eax # 128->0, 192->2, 256->4 1103 jz aes_loop_par_enc_done\@ 1104 1105aes_loop_par_enc\@: 1106 MOVADQ (%r10),\TMP3 1107.irpc index, 1234 1108 AESENC \TMP3, %xmm\index 1109.endr 1110 add $16,%r10 1111 sub $1,%eax 1112 jnz aes_loop_par_enc\@ 1113 1114aes_loop_par_enc_done\@: 1115 MOVADQ (%r10), \TMP3 1116 AESENCLAST \TMP3, \XMM1 # Round 10 1117 
AESENCLAST \TMP3, \XMM2 1118 AESENCLAST \TMP3, \XMM3 1119 AESENCLAST \TMP3, \XMM4 1120 movdqu HashKey_k(%arg2), \TMP5 1121 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1122 movdqu (%arg4,%r11,1), \TMP3 1123 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1124 movdqu 16(%arg4,%r11,1), \TMP3 1125 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1126 movdqu 32(%arg4,%r11,1), \TMP3 1127 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1128 movdqu 48(%arg4,%r11,1), \TMP3 1129 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1130 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer 1131 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer 1132 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer 1133 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer 1134 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap 1135 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap 1136 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap 1137 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap 1138 1139 pxor \TMP4, \TMP1 1140 pxor \XMM8, \XMM5 1141 pxor \TMP6, \TMP2 1142 pxor \TMP1, \TMP2 1143 pxor \XMM5, \TMP2 1144 movdqa \TMP2, \TMP3 1145 pslldq $8, \TMP3 # left shift TMP3 2 DWs 1146 psrldq $8, \TMP2 # right shift TMP2 2 DWs 1147 pxor \TMP3, \XMM5 1148 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1149 1150 # first phase of reduction 1151 1152 movdqa \XMM5, \TMP2 1153 movdqa \XMM5, \TMP3 1154 movdqa \XMM5, \TMP4 1155# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1156 pslld $31, \TMP2 # packed right shift << 31 1157 pslld $30, \TMP3 # packed right shift << 30 1158 pslld $25, \TMP4 # packed right shift << 25 1159 pxor \TMP3, \TMP2 # xor the shifted versions 1160 pxor \TMP4, \TMP2 1161 movdqa \TMP2, \TMP5 1162 psrldq $4, \TMP5 # right shift T5 1 DW 1163 pslldq $12, \TMP2 # left shift T2 3 DWs 1164 pxor \TMP2, \XMM5 1165 1166 # second phase of reduction 1167 1168 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into 
TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2                    # packed right shift >>1
	psrld	  $2, \TMP3                    # packed right shift >>2
	psrld	  $7, \TMP4                    # packed right shift >>7
	pxor	  \TMP3,\TMP2		       # xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5                 # result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/*
* GHASH_4_ENCRYPT_4_PARALLEL_DEC
*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* %arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*
* On entry XMM1-XMM4 hold the four blocks to be hashed and XMM0 holds the
* counter block. XMM1-XMM4 are copied to XMM5-XMM8, then XMM0 is incremented
* four times (paddd ONE) to form four new counter blocks in XMM1-XMM4. These
* are byte-swapped and run through the AES rounds using the round keys at
* (%arg1). The AES rounds are interleaved with the Karatsuba carry-less
* multiplications of XMM5-XMM8 by HashKey_4, HashKey_3, HashKey_2 and HashKey
* (and the matching *_k values), all read from (%arg2).
*
* 64 bytes are read from (%arg4,%r11), XORed with the encrypted counter
* blocks and written to (%arg3,%r11). Each of XMM1-XMM4 is then reloaded with
* the input block it consumed and byte-swapped.
*
* On exit XMM5 holds the reduced GHASH product, which has also been XORed
* into XMM1.
*
* Clobbers: TMP1-TMP6, XMM5-XMM8, %xmm15 (loaded with SHUF_MASK), %eax, %r10
* and the flags.
*
* The extra-round loop names %xmm1-%xmm4 directly instead of going through
* the XMM1-XMM4 macro arguments, so callers presumably have to pass exactly
* those registers as XMM1-XMM4 - TODO confirm against the call sites.
* The "operation" argument is not referenced anywhere in the macro body.
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	/* save the blocks to be hashed; XMM1-XMM4 are reused for the counters */
	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply XMM5 * HashKey_4 using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqu	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	/* AES round 0: XOR the four counter blocks with the key at (%arg1) */
	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqu	  HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	/* multiply XMM6 * HashKey_3 using karatsuba */
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5

	# Multiply XMM7 * HashKey_2 using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqu	  HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
	/*
	 * Extra AES rounds for the longer key sizes: eax counts the rounds
	 * still to run before the last one, r10 walks the round keys starting
	 * at 0xa0(%arg1).
	 */
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done\@

aes_loop_par_dec\@:
	MOVADQ	  (%r10),\TMP3
.irpc	index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec\@

aes_loop_par_dec_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1           # last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqu	  HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	/*
	 * For each of the four blocks: load the input block, XOR it with the
	 * encrypted counter, store the result, then keep the input block in
	 * XMMn (it is byte-swapped below).
	 */
	movdqu	  (%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
	movdqa	  \TMP3, \XMM1
	movdqu	  16(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
	movdqa	  \TMP3, \XMM2
	movdqu	  32(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
	movdqa	  \TMP3, \XMM3
	movdqu	  48(%arg4,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
	movdqa	  \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	/* fold the last product in and combine the karatsuba middle part */
	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa    \XMM5, \TMP2
	movdqa    \XMM5, \TMP3
	movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed left shift << 31
	pslld     $30, \TMP3                   # packed left shift << 30
	pslld     $25, \TMP4                   # packed left shift << 25
	pxor      \TMP3, \TMP2	               # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5                    # right shift T5 1 DW
	pslldq    $12, \TMP2                   # left shift T2 3 DWs
	pxor      \TMP2, \XMM5

	# second phase of reduction

	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa    \XMM5,\TMP3
	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed right shift >>1
	psrld     $2, \TMP3                    # packed right shift >>2
	psrld     $7, \TMP4                    # packed right shift >>7
	pxor      \TMP3,\TMP2		       # xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. 
*
* XMM1, XMM2, XMM3 and XMM4 are multiplied by HashKey_4, HashKey_3,
* HashKey_2 and HashKey respectively (Karatsuba, using the matching *_k
* values), all read from (%arg2). The four products are accumulated and
* reduced; the reduced result is left in XMMDst.
*
* Clobbers TMP1-TMP7 and XMM1-XMM4 (XMM1 ends up holding the accumulated
* middle terms, XMM2-XMM4 their a0*b0 products).
*/
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply XMM1 * HashKey_4 (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqu	  HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
	movdqu	  HashKey_4_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1

	# Multiply XMM2 * HashKey_3 (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqu	  HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
	movdqu	  HashKey_3_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM3 * HashKey_2 (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqu	  HashKey_2(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
	movdqu	  HashKey_2_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1

	# Multiply XMM4 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqu	  HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
	movdqu	  HashKey_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa    \XMMDst, \TMP2
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld     $31, \TMP2                # packed left shifting << 31
	pslld     $30, \TMP3                # packed left shifting << 30
	pslld     $25, \TMP4                # packed left shifting << 25
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP7
	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
	pxor      \TMP2, \XMMDst

	# second phase of the reduction
	movdqa    \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
	psrld     $1, \TMP2                 # packed right shift >> 1
	psrld     $2, \TMP3                 # packed right shift >> 2
	psrld     $7, \TMP4                 # packed right shift >> 7
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	pxor      \TMP7, \TMP2
	pxor      \TMP2, \XMMDst
	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
.endm


/* Encryption of a single block
* uses eax & r10
*
* XMM0 is XORed with the round key at (%arg1), then run through
* keysize/4 + 5 AESENC rounds (9, 11 or 13) using the round keys that start
* at 16(%arg1), and finally through one AESENCLAST with the key that follows.
* The result is left in XMM0. TMP1 and the flags are clobbered as well.
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor		(%arg1), \XMM0
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	lea		16(%arg1), %r10	  # get first expanded key address

_esb_loop_\@:
	MOVADQ		(%r10),\TMP1
	AESENC		\TMP1,\XMM0
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	/* r10 now points at the last round key */
	MOVADQ		(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0
.endm
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                      // Context data
*                    u8 *out,          // Plaintext output. Decrypt in-place is allowed.
*                    const u8 *in,     // Ciphertext input
*                    u64 plaintext_len, // Length of data in bytes for decryption.
*                    u8 *iv,           // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,  // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,    // Additional Authentication Data (AAD)
*                    u64 aad_len,      // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,     // Authenticated Tag output. The driver will compare this to the
*                                      // given authentication tag and only return the plaintext if they match.
*                    u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                      // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. 
we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                      Salt (From the SA)                       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |        (This is the sequence number from IPSec header)        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                           SPI (A1)                            |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                  32-bit Sequence Number (A0)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                           SPI (A2)                            |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |            64-bit Extended Sequence Number {A1,A0}            |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                       AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
SYM_FUNC_START(aesni_gcm_dec)
	FUNC_SAVE

	/*
	 * Going by the prototype in the comment above, arguments 6-9 are
	 * iv, hash_subkey, aad and aad_len. arg7 and up are spelled without
	 * the % prefix, unlike %arg6 - presumably they are stack operands
	 * defined elsewhere in the file; confirm against their definitions.
	 */
	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC dec
	/* arguments 10-11 of the prototype: auth_tag and auth_tag_len */
	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_dec)


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                      // Context data
*                    u8 *out,          // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,     // Plaintext input
*                    u64 plaintext_len, // Length of data in bytes for encryption.
*                    u8 *iv,           // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,  // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,    // Additional Authentication Data (AAD)
*                    u64 aad_len,      // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,     // Authenticated Tag output.
*                    u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                      // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. 
we are using the 1627* first set of 11 keys in the data structure void *aes_ctx 1628* 1629* 1630* iv: 1631* 0 1 2 3 1632* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1633* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1634* | Salt (From the SA) | 1635* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1636* | Initialization Vector | 1637* | (This is the sequence number from IPSec header) | 1638* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1639* | 0x1 | 1640* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1641* 1642* 1643* 1644* AAD: 1645* AAD padded to 128 bits with 0 1646* for example, assume AAD is a u32 vector 1647* 1648* if AAD is 8 bytes: 1649* AAD[3] = {A0, A1}; 1650* padded AAD in xmm register = {A1 A0 0 0} 1651* 1652* 0 1 2 3 1653* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1654* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1655* | SPI (A1) | 1656* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1657* | 32-bit Sequence Number (A0) | 1658* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1659* | 0x0 | 1660* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1661* 1662* AAD Format with 32-bit Sequence Number 1663* 1664* if AAD is 12 bytes: 1665* AAD[3] = {A0, A1, A2}; 1666* padded AAD in xmm register = {A2 A1 A0 0} 1667* 1668* 0 1 2 3 1669* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1670* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1671* | SPI (A2) | 1672* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1673* | 64-bit Extended Sequence Number {A1,A0} | 1674* | | 1675* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1676* | 0x0 | 1677* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1678* 1679* AAD Format with 64-bit Extended Sequence Number 1680* 1681* poly = x^128 + x^127 + 
x^126 + x^121 + 1
***************************************************************************/
SYM_FUNC_START(aesni_gcm_enc)
	FUNC_SAVE

	/*
	 * Going by the prototype in the comment above, arguments 6-9 are
	 * iv, hash_subkey, aad and aad_len. arg7 and up are spelled without
	 * the % prefix, unlike %arg6 - presumably they are stack operands
	 * defined elsewhere in the file; confirm against their definitions.
	 */
	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC enc

	/* arguments 10-11 of the prototype: auth_tag and auth_tag_len */
	GCM_COMPLETE arg10, arg11
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_enc)

/*****************************************************************************
* void aesni_gcm_init(void *aes_ctx,   // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                      // context data
*                     u8 *iv,          // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                     u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                     const u8 *aad,   // Additional Authentication Data (AAD)
*                     u64 aad_len)     // Length of AAD in bytes.
*
* Only runs GCM_INIT; here iv, hash_subkey, aad and aad_len are arguments 3-6.
*/
SYM_FUNC_START(aesni_gcm_init)
	FUNC_SAVE
	GCM_INIT %arg3, %arg4,%arg5, %arg6
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_init)

/*****************************************************************************
* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                      // context data
*                    u8 *out,          // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,     // Plaintext input
*                    u64 plaintext_len); // Length of data in bytes for encryption.
*
* Only runs GCM_ENC_DEC enc; no GCM_INIT or GCM_COMPLETE is issued here.
*/
SYM_FUNC_START(aesni_gcm_enc_update)
	FUNC_SAVE
	GCM_ENC_DEC enc
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_enc_update)

/*****************************************************************************
* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                      // context data
*                    u8 *out,          // Plaintext output. 
Decrypt in-place is allowed.
*                    const u8 *in,     // Ciphertext input
*                    u64 plaintext_len); // Length of data in bytes for decryption.
*
* Only runs GCM_ENC_DEC dec; no GCM_INIT or GCM_COMPLETE is issued here.
*/
SYM_FUNC_START(aesni_gcm_dec_update)
	FUNC_SAVE
	GCM_ENC_DEC dec
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_dec_update)

/*****************************************************************************
* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                      // context data
*                    u8 *auth_tag,     // Authenticated Tag output.
*                    u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                      // 12 or 8.
*
* Only runs GCM_COMPLETE on auth_tag / auth_tag_len (arguments 3 and 4).
*/
SYM_FUNC_START(aesni_gcm_finalize)
	FUNC_SAVE
	GCM_COMPLETE %arg3 %arg4
	FUNC_RESTORE
	ret
SYM_FUNC_END(aesni_gcm_finalize)

#endif


/*
 * _key_expansion_128 / _key_expansion_256a: internal helper of aesni_set_key
 * input:
 *	%xmm0:	previous round key
 *	%xmm1:	AESKEYGENASSIST result (dword 3 is used)
 *	%xmm4:	low dword must be 0 (aesni_set_key clears the register)
 *	TKEYP:	where to store the new round key
 * output:
 *	%xmm0:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 0x10
 * changed:
 *	%xmm1, %xmm4
 */
SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
SYM_FUNC_START_LOCAL(_key_expansion_256a)
	pshufd $0b11111111, %xmm1, %xmm1	# broadcast dword 3 of xmm1
	/*
	 * With xmm0 = (a, b, c, d) the two shufps/pxor pairs turn it into the
	 * running XOR (a, a^b, a^b^c, a^b^c^d).
	 */
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_END_ALIAS(_key_expansion_128)

/*
 * _key_expansion_192a: internal helper of aesni_set_key
 * input:
 *	%xmm0, %xmm2:	key material carried over from the previous step
 *	%xmm1:	AESKEYGENASSIST result (dword 1 is used)
 *	%xmm4:	low dword must be 0
 *	TKEYP:	where to store the output
 * output:
 *	two 16-byte round keys stored at (TKEYP) and 0x10(TKEYP);
 *	%xmm0 and %xmm2 updated for the next step; TKEYP advanced by 0x20
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
SYM_FUNC_START_LOCAL(_key_expansion_192a)
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1
	/* running XOR over the dwords of xmm0, as in _key_expansion_128 */
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6			# xmm6 = xmm2 before the update
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of new xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	/* store (old xmm2 low qword, new xmm0 low qword) */
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	/* store (new xmm0 high qword, new xmm2 low qword) */
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret
SYM_FUNC_END(_key_expansion_192a)

/*
 * _key_expansion_192b: internal helper of aesni_set_key
 * Updates %xmm0 and %xmm2 the same way as _key_expansion_192a, but stores
 * only the new %xmm0 at (TKEYP) and advances TKEYP by 0x10.
 */
SYM_FUNC_START_LOCAL(_key_expansion_192b)
	pshufd $0b01010101, 
%xmm1, %xmm1
	/* running XOR over the dwords of xmm0; relies on the low dword of xmm4 being 0 */
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of new xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
SYM_FUNC_END(_key_expansion_192b)

/*
 * _key_expansion_256b: internal helper of aesni_set_key
 * input:
 *	%xmm2:	round key before the previous one
 *	%xmm1:	AESKEYGENASSIST result (dword 2 is used)
 *	%xmm4:	low dword must be 0
 *	TKEYP:	where to store the new round key
 * output:
 *	%xmm2:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 0x10
 * changed:
 *	%xmm1, %xmm4
 */
SYM_FUNC_START_LOCAL(_key_expansion_256b)
	pshufd $0b10101010, %xmm1, %xmm1	# broadcast dword 2 of xmm1
	/* running XOR over the dwords of xmm2 */
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	ret
SYM_FUNC_END(_key_expansion_256b)

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Expands in_key into ctx:
 *	ctx + 0:	encryption round keys (11, 13 or 15 of them)
 *	ctx + 240:	decryption round keys, i.e. the encryption keys in
 *			reverse order with AESIMC applied to all but the first
 *			and the last
 *	ctx + 480:	key_len as passed in
 *
 * key_len is not validated: only its low byte is compared with 24; below 24
 * takes the 128-bit path, equal takes the 192-bit path, anything else the
 * 256-bit path.
 *
 * AREG is cleared before returning (presumably the return value 0 - confirm
 * against the AREG definition).
 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	/* 256-bit key: the second 16 bytes of the user key are round key 1 */
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	/* 192-bit key: only 8 more bytes of user key are loaded */
	movq 0x10(UKEYP), %xmm2		# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	/*
	 * Build the decryption schedule at offset 240. TKEYP is stepped back
	 * to the last encryption round key; the first and last encryption
	 * keys are swapped into the two ends of the decryption schedule
	 * unchanged.
	 */
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	/*
	 * KEYP walks the remaining encryption keys upwards, UKEYP walks the
	 * decryption schedule downwards; stop once KEYP reaches the last
	 * encryption key (TKEYP).
	 */
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_set_key)

/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts the 16-byte block at src into dst with _aesni_enc1. The key
 * length is read from offset 480 of ctx; src and dst are accessed with
 * unaligned moves.
 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_enc)

/*
 * _aesni_enc1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length (compared with 24 to select the round count)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	/* lea instead of add: the flags from the cmp are still needed by je */
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP
	/* two extra rounds on top of the 192-bit path */
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	/* two extra rounds on top of the 128-bit path */
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 
0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE		# last round
	ret
SYM_FUNC_END(_aesni_enc1)

/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length (compared with 24 to select the round count)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Same round structure as _aesni_enc1, applied to four independent states
 * with each round key loaded once.
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	/* lea instead of add: the flags from the cmp are still needed by je */
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	/* two extra rounds on top of the 192-bit path */
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	/*
	 * NOTE(review): the alignment directive below is commented out,
	 * unlike the one before .Lenc192 in _aesni_enc1 - confirm whether
	 * that is intentional.
	 */
#.align 4
.L4enc192:
	/* two extra rounds on top of the 128-bit path */
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
SYM_FUNC_END(_aesni_enc4)

/*
 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
 *
 * Decrypts the 16-byte block at src into dst with _aesni_dec1. The key
 * length is read from offset 480 of ctx, then KEYP is advanced by 240 so
 * that it points at the decryption round keys written by aesni_set_key.
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP			# switch to the decryption key schedule
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		#output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_dec)

/*
 * _aesni_dec1:		internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	/* lea instead of add: the flags from the cmp are still needed by je */
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	/* two extra rounds on top of the 192-bit path */
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
2183 AESDEC KEY STATE 2184.align 4 2185.Ldec192: 2186 movaps -0x40(TKEYP), KEY 2187 AESDEC KEY STATE 2188 movaps -0x30(TKEYP), KEY 2189 AESDEC KEY STATE 2190.align 4 2191.Ldec128: 2192 movaps -0x20(TKEYP), KEY 2193 AESDEC KEY STATE 2194 movaps -0x10(TKEYP), KEY 2195 AESDEC KEY STATE 2196 movaps (TKEYP), KEY 2197 AESDEC KEY STATE 2198 movaps 0x10(TKEYP), KEY 2199 AESDEC KEY STATE 2200 movaps 0x20(TKEYP), KEY 2201 AESDEC KEY STATE 2202 movaps 0x30(TKEYP), KEY 2203 AESDEC KEY STATE 2204 movaps 0x40(TKEYP), KEY 2205 AESDEC KEY STATE 2206 movaps 0x50(TKEYP), KEY 2207 AESDEC KEY STATE 2208 movaps 0x60(TKEYP), KEY 2209 AESDEC KEY STATE 2210 movaps 0x70(TKEYP), KEY 2211 AESDECLAST KEY STATE 2212 ret 2213SYM_FUNC_END(_aesni_dec1) 2214 2215/* 2216 * _aesni_dec4: internal ABI 2217 * input: 2218 * KEYP: key struct pointer 2219 * KLEN: key length 2220 * STATE1: initial state (input) 2221 * STATE2 2222 * STATE3 2223 * STATE4 2224 * output: 2225 * STATE1: finial state (output) 2226 * STATE2 2227 * STATE3 2228 * STATE4 2229 * changed: 2230 * KEY 2231 * TKEYP (T1) 2232 */ 2233SYM_FUNC_START_LOCAL(_aesni_dec4) 2234 movaps (KEYP), KEY # key 2235 mov KEYP, TKEYP 2236 pxor KEY, STATE1 # round 0 2237 pxor KEY, STATE2 2238 pxor KEY, STATE3 2239 pxor KEY, STATE4 2240 add $0x30, TKEYP 2241 cmp $24, KLEN 2242 jb .L4dec128 2243 lea 0x20(TKEYP), TKEYP 2244 je .L4dec192 2245 add $0x20, TKEYP 2246 movaps -0x60(TKEYP), KEY 2247 AESDEC KEY STATE1 2248 AESDEC KEY STATE2 2249 AESDEC KEY STATE3 2250 AESDEC KEY STATE4 2251 movaps -0x50(TKEYP), KEY 2252 AESDEC KEY STATE1 2253 AESDEC KEY STATE2 2254 AESDEC KEY STATE3 2255 AESDEC KEY STATE4 2256.align 4 2257.L4dec192: 2258 movaps -0x40(TKEYP), KEY 2259 AESDEC KEY STATE1 2260 AESDEC KEY STATE2 2261 AESDEC KEY STATE3 2262 AESDEC KEY STATE4 2263 movaps -0x30(TKEYP), KEY 2264 AESDEC KEY STATE1 2265 AESDEC KEY STATE2 2266 AESDEC KEY STATE3 2267 AESDEC KEY STATE4 2268.align 4 2269.L4dec128: 2270 movaps -0x20(TKEYP), KEY 2271 AESDEC KEY STATE1 2272 AESDEC KEY 
STATE2 2273 AESDEC KEY STATE3 2274 AESDEC KEY STATE4 2275 movaps -0x10(TKEYP), KEY 2276 AESDEC KEY STATE1 2277 AESDEC KEY STATE2 2278 AESDEC KEY STATE3 2279 AESDEC KEY STATE4 2280 movaps (TKEYP), KEY 2281 AESDEC KEY STATE1 2282 AESDEC KEY STATE2 2283 AESDEC KEY STATE3 2284 AESDEC KEY STATE4 2285 movaps 0x10(TKEYP), KEY 2286 AESDEC KEY STATE1 2287 AESDEC KEY STATE2 2288 AESDEC KEY STATE3 2289 AESDEC KEY STATE4 2290 movaps 0x20(TKEYP), KEY 2291 AESDEC KEY STATE1 2292 AESDEC KEY STATE2 2293 AESDEC KEY STATE3 2294 AESDEC KEY STATE4 2295 movaps 0x30(TKEYP), KEY 2296 AESDEC KEY STATE1 2297 AESDEC KEY STATE2 2298 AESDEC KEY STATE3 2299 AESDEC KEY STATE4 2300 movaps 0x40(TKEYP), KEY 2301 AESDEC KEY STATE1 2302 AESDEC KEY STATE2 2303 AESDEC KEY STATE3 2304 AESDEC KEY STATE4 2305 movaps 0x50(TKEYP), KEY 2306 AESDEC KEY STATE1 2307 AESDEC KEY STATE2 2308 AESDEC KEY STATE3 2309 AESDEC KEY STATE4 2310 movaps 0x60(TKEYP), KEY 2311 AESDEC KEY STATE1 2312 AESDEC KEY STATE2 2313 AESDEC KEY STATE3 2314 AESDEC KEY STATE4 2315 movaps 0x70(TKEYP), KEY 2316 AESDECLAST KEY STATE1 # last round 2317 AESDECLAST KEY STATE2 2318 AESDECLAST KEY STATE3 2319 AESDECLAST KEY STATE4 2320 ret 2321SYM_FUNC_END(_aesni_dec4) 2322 2323/* 2324 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2325 * size_t len) 2326 */ 2327SYM_FUNC_START(aesni_ecb_enc) 2328 FRAME_BEGIN 2329#ifndef __x86_64__ 2330 pushl LEN 2331 pushl KEYP 2332 pushl KLEN 2333 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2334 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2335 movl (FRAME_OFFSET+24)(%esp), INP # src 2336 movl (FRAME_OFFSET+28)(%esp), LEN # len 2337#endif 2338 test LEN, LEN # check length 2339 jz .Lecb_enc_ret 2340 mov 480(KEYP), KLEN 2341 cmp $16, LEN 2342 jb .Lecb_enc_ret 2343 cmp $64, LEN 2344 jb .Lecb_enc_loop1 2345.align 4 2346.Lecb_enc_loop4: 2347 movups (INP), STATE1 2348 movups 0x10(INP), STATE2 2349 movups 0x20(INP), STATE3 2350 movups 0x30(INP), STATE4 2351 call _aesni_enc4 2352 movups STATE1, (OUTP) 
2353 movups STATE2, 0x10(OUTP) 2354 movups STATE3, 0x20(OUTP) 2355 movups STATE4, 0x30(OUTP) 2356 sub $64, LEN 2357 add $64, INP 2358 add $64, OUTP 2359 cmp $64, LEN 2360 jge .Lecb_enc_loop4 2361 cmp $16, LEN 2362 jb .Lecb_enc_ret 2363.align 4 2364.Lecb_enc_loop1: 2365 movups (INP), STATE1 2366 call _aesni_enc1 2367 movups STATE1, (OUTP) 2368 sub $16, LEN 2369 add $16, INP 2370 add $16, OUTP 2371 cmp $16, LEN 2372 jge .Lecb_enc_loop1 2373.Lecb_enc_ret: 2374#ifndef __x86_64__ 2375 popl KLEN 2376 popl KEYP 2377 popl LEN 2378#endif 2379 FRAME_END 2380 ret 2381SYM_FUNC_END(aesni_ecb_enc) 2382 2383/* 2384 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2385 * size_t len); 2386 */ 2387SYM_FUNC_START(aesni_ecb_dec) 2388 FRAME_BEGIN 2389#ifndef __x86_64__ 2390 pushl LEN 2391 pushl KEYP 2392 pushl KLEN 2393 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2394 movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2395 movl (FRAME_OFFSET+24)(%esp), INP # src 2396 movl (FRAME_OFFSET+28)(%esp), LEN # len 2397#endif 2398 test LEN, LEN 2399 jz .Lecb_dec_ret 2400 mov 480(KEYP), KLEN 2401 add $240, KEYP 2402 cmp $16, LEN 2403 jb .Lecb_dec_ret 2404 cmp $64, LEN 2405 jb .Lecb_dec_loop1 2406.align 4 2407.Lecb_dec_loop4: 2408 movups (INP), STATE1 2409 movups 0x10(INP), STATE2 2410 movups 0x20(INP), STATE3 2411 movups 0x30(INP), STATE4 2412 call _aesni_dec4 2413 movups STATE1, (OUTP) 2414 movups STATE2, 0x10(OUTP) 2415 movups STATE3, 0x20(OUTP) 2416 movups STATE4, 0x30(OUTP) 2417 sub $64, LEN 2418 add $64, INP 2419 add $64, OUTP 2420 cmp $64, LEN 2421 jge .Lecb_dec_loop4 2422 cmp $16, LEN 2423 jb .Lecb_dec_ret 2424.align 4 2425.Lecb_dec_loop1: 2426 movups (INP), STATE1 2427 call _aesni_dec1 2428 movups STATE1, (OUTP) 2429 sub $16, LEN 2430 add $16, INP 2431 add $16, OUTP 2432 cmp $16, LEN 2433 jge .Lecb_dec_loop1 2434.Lecb_dec_ret: 2435#ifndef __x86_64__ 2436 popl KLEN 2437 popl KEYP 2438 popl LEN 2439#endif 2440 FRAME_END 2441 ret 2442SYM_FUNC_END(aesni_ecb_dec) 2443 2444/* 2445 * 
void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2446 * size_t len, u8 *iv) 2447 */ 2448SYM_FUNC_START(aesni_cbc_enc) 2449 FRAME_BEGIN 2450#ifndef __x86_64__ 2451 pushl IVP 2452 pushl LEN 2453 pushl KEYP 2454 pushl KLEN 2455 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2456 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2457 movl (FRAME_OFFSET+28)(%esp), INP # src 2458 movl (FRAME_OFFSET+32)(%esp), LEN # len 2459 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2460#endif 2461 cmp $16, LEN 2462 jb .Lcbc_enc_ret 2463 mov 480(KEYP), KLEN 2464 movups (IVP), STATE # load iv as initial state 2465.align 4 2466.Lcbc_enc_loop: 2467 movups (INP), IN # load input 2468 pxor IN, STATE 2469 call _aesni_enc1 2470 movups STATE, (OUTP) # store output 2471 sub $16, LEN 2472 add $16, INP 2473 add $16, OUTP 2474 cmp $16, LEN 2475 jge .Lcbc_enc_loop 2476 movups STATE, (IVP) 2477.Lcbc_enc_ret: 2478#ifndef __x86_64__ 2479 popl KLEN 2480 popl KEYP 2481 popl LEN 2482 popl IVP 2483#endif 2484 FRAME_END 2485 ret 2486SYM_FUNC_END(aesni_cbc_enc) 2487 2488/* 2489 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2490 * size_t len, u8 *iv) 2491 */ 2492SYM_FUNC_START(aesni_cbc_dec) 2493 FRAME_BEGIN 2494#ifndef __x86_64__ 2495 pushl IVP 2496 pushl LEN 2497 pushl KEYP 2498 pushl KLEN 2499 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2500 movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2501 movl (FRAME_OFFSET+28)(%esp), INP # src 2502 movl (FRAME_OFFSET+32)(%esp), LEN # len 2503 movl (FRAME_OFFSET+36)(%esp), IVP # iv 2504#endif 2505 cmp $16, LEN 2506 jb .Lcbc_dec_just_ret 2507 mov 480(KEYP), KLEN 2508 add $240, KEYP 2509 movups (IVP), IV 2510 cmp $64, LEN 2511 jb .Lcbc_dec_loop1 2512.align 4 2513.Lcbc_dec_loop4: 2514 movups (INP), IN1 2515 movaps IN1, STATE1 2516 movups 0x10(INP), IN2 2517 movaps IN2, STATE2 2518#ifdef __x86_64__ 2519 movups 0x20(INP), IN3 2520 movaps IN3, STATE3 2521 movups 0x30(INP), IN4 2522 movaps IN4, STATE4 2523#else 2524 movups 0x20(INP), IN1 2525 movaps IN1, 
STATE3 2526 movups 0x30(INP), IN2 2527 movaps IN2, STATE4 2528#endif 2529 call _aesni_dec4 2530 pxor IV, STATE1 2531#ifdef __x86_64__ 2532 pxor IN1, STATE2 2533 pxor IN2, STATE3 2534 pxor IN3, STATE4 2535 movaps IN4, IV 2536#else 2537 pxor IN1, STATE4 2538 movaps IN2, IV 2539 movups (INP), IN1 2540 pxor IN1, STATE2 2541 movups 0x10(INP), IN2 2542 pxor IN2, STATE3 2543#endif 2544 movups STATE1, (OUTP) 2545 movups STATE2, 0x10(OUTP) 2546 movups STATE3, 0x20(OUTP) 2547 movups STATE4, 0x30(OUTP) 2548 sub $64, LEN 2549 add $64, INP 2550 add $64, OUTP 2551 cmp $64, LEN 2552 jge .Lcbc_dec_loop4 2553 cmp $16, LEN 2554 jb .Lcbc_dec_ret 2555.align 4 2556.Lcbc_dec_loop1: 2557 movups (INP), IN 2558 movaps IN, STATE 2559 call _aesni_dec1 2560 pxor IV, STATE 2561 movups STATE, (OUTP) 2562 movaps IN, IV 2563 sub $16, LEN 2564 add $16, INP 2565 add $16, OUTP 2566 cmp $16, LEN 2567 jge .Lcbc_dec_loop1 2568.Lcbc_dec_ret: 2569 movups IV, (IVP) 2570.Lcbc_dec_just_ret: 2571#ifndef __x86_64__ 2572 popl KLEN 2573 popl KEYP 2574 popl LEN 2575 popl IVP 2576#endif 2577 FRAME_END 2578 ret 2579SYM_FUNC_END(aesni_cbc_dec) 2580 2581#ifdef __x86_64__ 2582.pushsection .rodata 2583.align 16 2584.Lbswap_mask: 2585 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2586.popsection 2587 2588/* 2589 * _aesni_inc_init: internal ABI 2590 * setup registers used by _aesni_inc 2591 * input: 2592 * IV 2593 * output: 2594 * CTR: == IV, in little endian 2595 * TCTR_LOW: == lower qword of CTR 2596 * INC: == 1, in little endian 2597 * BSWAP_MASK == endian swapping mask 2598 */ 2599SYM_FUNC_START_LOCAL(_aesni_inc_init) 2600 movaps .Lbswap_mask, BSWAP_MASK 2601 movaps IV, CTR 2602 PSHUFB_XMM BSWAP_MASK CTR 2603 mov $1, TCTR_LOW 2604 MOVQ_R64_XMM TCTR_LOW INC 2605 MOVQ_R64_XMM CTR TCTR_LOW 2606 ret 2607SYM_FUNC_END(_aesni_inc_init) 2608 2609/* 2610 * _aesni_inc: internal ABI 2611 * Increase IV by 1, IV is in big endian 2612 * input: 2613 * IV 2614 * CTR: == IV, in little endian 2615 * TCTR_LOW: == lower 
qword of CTR 2616 * INC: == 1, in little endian 2617 * BSWAP_MASK == endian swapping mask 2618 * output: 2619 * IV: Increase by 1 2620 * changed: 2621 * CTR: == output IV, in little endian 2622 * TCTR_LOW: == lower qword of CTR 2623 */ 2624SYM_FUNC_START_LOCAL(_aesni_inc) 2625 paddq INC, CTR 2626 add $1, TCTR_LOW 2627 jnc .Linc_low 2628 pslldq $8, INC 2629 paddq INC, CTR 2630 psrldq $8, INC 2631.Linc_low: 2632 movaps CTR, IV 2633 PSHUFB_XMM BSWAP_MASK IV 2634 ret 2635SYM_FUNC_END(_aesni_inc) 2636 2637/* 2638 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2639 * size_t len, u8 *iv) 2640 */ 2641SYM_FUNC_START(aesni_ctr_enc) 2642 FRAME_BEGIN 2643 cmp $16, LEN 2644 jb .Lctr_enc_just_ret 2645 mov 480(KEYP), KLEN 2646 movups (IVP), IV 2647 call _aesni_inc_init 2648 cmp $64, LEN 2649 jb .Lctr_enc_loop1 2650.align 4 2651.Lctr_enc_loop4: 2652 movaps IV, STATE1 2653 call _aesni_inc 2654 movups (INP), IN1 2655 movaps IV, STATE2 2656 call _aesni_inc 2657 movups 0x10(INP), IN2 2658 movaps IV, STATE3 2659 call _aesni_inc 2660 movups 0x20(INP), IN3 2661 movaps IV, STATE4 2662 call _aesni_inc 2663 movups 0x30(INP), IN4 2664 call _aesni_enc4 2665 pxor IN1, STATE1 2666 movups STATE1, (OUTP) 2667 pxor IN2, STATE2 2668 movups STATE2, 0x10(OUTP) 2669 pxor IN3, STATE3 2670 movups STATE3, 0x20(OUTP) 2671 pxor IN4, STATE4 2672 movups STATE4, 0x30(OUTP) 2673 sub $64, LEN 2674 add $64, INP 2675 add $64, OUTP 2676 cmp $64, LEN 2677 jge .Lctr_enc_loop4 2678 cmp $16, LEN 2679 jb .Lctr_enc_ret 2680.align 4 2681.Lctr_enc_loop1: 2682 movaps IV, STATE 2683 call _aesni_inc 2684 movups (INP), IN 2685 call _aesni_enc1 2686 pxor IN, STATE 2687 movups STATE, (OUTP) 2688 sub $16, LEN 2689 add $16, INP 2690 add $16, OUTP 2691 cmp $16, LEN 2692 jge .Lctr_enc_loop1 2693.Lctr_enc_ret: 2694 movups IV, (IVP) 2695.Lctr_enc_just_ret: 2696 FRAME_END 2697 ret 2698SYM_FUNC_END(aesni_ctr_enc) 2699 2700/* 2701 * _aesni_gf128mul_x_ble: internal ABI 2702 * Multiply in GF(2^128) for XTS IVs 
2703 * input: 2704 * IV: current IV 2705 * GF128MUL_MASK == mask with 0x87 and 0x01 2706 * output: 2707 * IV: next IV 2708 * changed: 2709 * CTR: == temporary value 2710 */ 2711#define _aesni_gf128mul_x_ble() \ 2712 pshufd $0x13, IV, CTR; \ 2713 paddq IV, IV; \ 2714 psrad $31, CTR; \ 2715 pand GF128MUL_MASK, CTR; \ 2716 pxor CTR, IV; 2717 2718/* 2719 * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst, 2720 * const u8 *src, bool enc, le128 *iv) 2721 */ 2722SYM_FUNC_START(aesni_xts_crypt8) 2723 FRAME_BEGIN 2724 cmpb $0, %cl 2725 movl $0, %ecx 2726 movl $240, %r10d 2727 leaq _aesni_enc4, %r11 2728 leaq _aesni_dec4, %rax 2729 cmovel %r10d, %ecx 2730 cmoveq %rax, %r11 2731 2732 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK 2733 movups (IVP), IV 2734 2735 mov 480(KEYP), KLEN 2736 addq %rcx, KEYP 2737 2738 movdqa IV, STATE1 2739 movdqu 0x00(INP), INC 2740 pxor INC, STATE1 2741 movdqu IV, 0x00(OUTP) 2742 2743 _aesni_gf128mul_x_ble() 2744 movdqa IV, STATE2 2745 movdqu 0x10(INP), INC 2746 pxor INC, STATE2 2747 movdqu IV, 0x10(OUTP) 2748 2749 _aesni_gf128mul_x_ble() 2750 movdqa IV, STATE3 2751 movdqu 0x20(INP), INC 2752 pxor INC, STATE3 2753 movdqu IV, 0x20(OUTP) 2754 2755 _aesni_gf128mul_x_ble() 2756 movdqa IV, STATE4 2757 movdqu 0x30(INP), INC 2758 pxor INC, STATE4 2759 movdqu IV, 0x30(OUTP) 2760 2761 CALL_NOSPEC %r11 2762 2763 movdqu 0x00(OUTP), INC 2764 pxor INC, STATE1 2765 movdqu STATE1, 0x00(OUTP) 2766 2767 _aesni_gf128mul_x_ble() 2768 movdqa IV, STATE1 2769 movdqu 0x40(INP), INC 2770 pxor INC, STATE1 2771 movdqu IV, 0x40(OUTP) 2772 2773 movdqu 0x10(OUTP), INC 2774 pxor INC, STATE2 2775 movdqu STATE2, 0x10(OUTP) 2776 2777 _aesni_gf128mul_x_ble() 2778 movdqa IV, STATE2 2779 movdqu 0x50(INP), INC 2780 pxor INC, STATE2 2781 movdqu IV, 0x50(OUTP) 2782 2783 movdqu 0x20(OUTP), INC 2784 pxor INC, STATE3 2785 movdqu STATE3, 0x20(OUTP) 2786 2787 _aesni_gf128mul_x_ble() 2788 movdqa IV, STATE3 2789 movdqu 0x60(INP), INC 2790 pxor INC, STATE3 2791 movdqu IV, 
0x60(OUTP) 2792 2793 movdqu 0x30(OUTP), INC 2794 pxor INC, STATE4 2795 movdqu STATE4, 0x30(OUTP) 2796 2797 _aesni_gf128mul_x_ble() 2798 movdqa IV, STATE4 2799 movdqu 0x70(INP), INC 2800 pxor INC, STATE4 2801 movdqu IV, 0x70(OUTP) 2802 2803 _aesni_gf128mul_x_ble() 2804 movups IV, (IVP) 2805 2806 CALL_NOSPEC %r11 2807 2808 movdqu 0x40(OUTP), INC 2809 pxor INC, STATE1 2810 movdqu STATE1, 0x40(OUTP) 2811 2812 movdqu 0x50(OUTP), INC 2813 pxor INC, STATE2 2814 movdqu STATE2, 0x50(OUTP) 2815 2816 movdqu 0x60(OUTP), INC 2817 pxor INC, STATE3 2818 movdqu STATE3, 0x60(OUTP) 2819 2820 movdqu 0x70(OUTP), INC 2821 pxor INC, STATE4 2822 movdqu STATE4, 0x70(OUTP) 2823 2824 FRAME_END 2825 ret 2826SYM_FUNC_END(aesni_xts_crypt8) 2827 2828#endif 2829