xref: /openbmc/linux/arch/x86/crypto/aesni-intel_asm.S (revision dcabb06bf127b3e0d3fbc94a2b65dd56c2725851)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Implement AES algorithm in Intel AES-NI instructions.
4 *
5 * The white paper of AES-NI instructions can be downloaded from:
6 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 *
8 * Copyright (C) 2008, Intel Corp.
9 *    Author: Huang Ying <ying.huang@intel.com>
10 *            Vinodh Gopal <vinodh.gopal@intel.com>
11 *            Kahraman Akdemir
12 *
13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14 * interface for 64-bit kernels.
15 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
17 *             Adrian Hoban <adrian.hoban@intel.com>
18 *             James Guilford (james.guilford@intel.com)
19 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
20 *             Tadeusz Struk (tadeusz.struk@intel.com)
21 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
22 *    Copyright (c) 2010, Intel Corporation.
23 *
24 * Ported x86_64 version to x86:
25 *    Author: Mathias Krause <minipli@googlemail.com>
26 */
27
28#include <linux/linkage.h>
29#include <asm/frame.h>
30#include <asm/nospec-branch.h>
31
32/*
33 * The following macros are used to move an (un)aligned 16 byte value to/from
34 * an XMM register.  This can be done for either FP or integer values: for FP
35 * use movaps (move aligned packed single), for integer use movdqa (move double quad
36 * aligned).  It doesn't make a performance difference which instruction is used
37 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
38 * shorter, so that is the one we'll use for now. (same for unaligned).
39 */
40#define MOVADQ	movaps
41#define MOVUDQ	movups
42
43#ifdef __x86_64__
44
45# constants in mergeable sections, linker can reorder and merge
46.section	.rodata.cst16.POLY, "aM", @progbits, 16
47.align 16
# ANDed with the carry mask in PRECOMPUTE when reducing HashKey<<1.
48POLY:   .octa 0xC2000000000000000000000000000001
49.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
50.align 16
# Compared (pcmpeqd) with the shifted-out carry in PRECOMPUTE.
51TWOONE: .octa 0x00000001000000000000000000000001
52
53.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
54.align 16
# pshufb control that reverses the byte order of a 16-byte value.
55SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
56.section	.rodata.cst16.MASK1, "aM", @progbits, 16
57.align 16
# Low 64 bits set.
58MASK1:      .octa 0x0000000000000000ffffffffffffffff
59.section	.rodata.cst16.MASK2, "aM", @progbits, 16
60.align 16
# High 64 bits set.
61MASK2:      .octa 0xffffffffffffffff0000000000000000
62.section	.rodata.cst16.ONE, "aM", @progbits, 16
63.align 16
# Added to the counter block with paddd.
64ONE:        .octa 0x00000000000000000000000000000001
65.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
66.align 16
# NOTE(review): this literal has 31 hex digits, so the most significant
# nibble is 0 - confirm that this is the intended value.
67F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
68.section	.rodata.cst16.dec, "aM", @progbits, 16
69.align 16
70dec:        .octa 0x1
71.section	.rodata.cst16.enc, "aM", @progbits, 16
72.align 16
73enc:        .octa 0x2
74
75# order of these constants should not change.
76# more specifically, ALL_F should follow SHIFT_MASK,
77# and zero should follow ALL_F
# The macros read 16 bytes at SHIFT_MASK+k and ALL_F+k (0 <= k <= 16), so
# these three values must stay contiguous and therefore live in plain
# .rodata rather than in mergeable sections:
#   SHIFT_MASK+k  used with pshufb shifts a register right by k bytes
#                 (the 0xff bytes picked up from ALL_F zero the vacated bytes)
#   ALL_F+k       is a mask that keeps the low 16-k bytes
78.section	.rodata, "a", @progbits
79.align 16
80SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
81ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
82            .octa 0x00000000000000000000000000000000
83
84.text
85
86
// Bytes pushed by FUNC_SAVE (three 8-byte registers).  The stack-argument
// operands arg7..arg11 below add it to their %rsp displacement.
87#define	STACK_OFFSET    8*3
88
// Byte offsets of the fields kept in the context structure addressed
// through %arg2 (field names in the comments follow GCM_INIT).
89#define AadHash 16*0		// running GHASH value
90#define AadLen 16*1		// ctx_data.aad_length
91#define InLen (16*1)+8		// ctx_data.in_length
92#define PBlockEncKey 16*2	// ctx_data.partial_block_enc_key
93#define OrigIV 16*3		// ctx_data.orig_IV
94#define CurCount 16*4		// ctx_data.current_counter
95#define PBlockLen 16*5		// ctx_data.partial_block_length
96#define	HashKey		16*6	// store HashKey <<1 mod poly here
97#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
98#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
99#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
100#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
101				// bits of  HashKey <<1 mod poly here
102				//(for Karatsuba purposes)
103#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
104				// bits of  HashKey^2 <<1 mod poly here
105				// (for Karatsuba purposes)
106#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
107				// bits of  HashKey^3 <<1 mod poly here
108				// (for Karatsuba purposes)
109#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
110				// bits of  HashKey^4 <<1 mod poly here
111				// (for Karatsuba purposes)
112
// Register names (without the % sigil) of the first six integer arguments.
113#define arg1 rdi
114#define arg2 rsi
115#define arg3 rdx
116#define arg4 rcx
117#define arg5 r8
118#define arg6 r9
// Stack-passed arguments as complete memory operands.  The displacements
// are only valid while %rsp is where FUNC_SAVE left it.
119#define arg7 STACK_OFFSET+8(%rsp)
120#define arg8 STACK_OFFSET+16(%rsp)
121#define arg9 STACK_OFFSET+24(%rsp)
122#define arg10 STACK_OFFSET+32(%rsp)
123#define arg11 STACK_OFFSET+40(%rsp)
// 32-bit value at byte offset 480 of the structure in %arg1; the macros
// derive the AES round count from it (presumably the key length in bytes,
// 16/24/32 - confirm against the C-side context layout).
124#define keysize 2*15*16(%arg1)
125#endif
126
127
// Symbolic names for the XMM registers.
127#define STATE1	%xmm0
128#define STATE2	%xmm4
129#define STATE3	%xmm5
130#define STATE4	%xmm6
131#define STATE	STATE1
132#define IN1	%xmm1
133#define IN2	%xmm7
134#define IN3	%xmm8
135#define IN4	%xmm9
136#define IN	IN1
137#define KEY	%xmm2
138#define IV	%xmm3
139
140#define BSWAP_MASK %xmm10
141#define CTR	%xmm11
142#define INC	%xmm12
143
// Same register as IN2, so the two names cannot be live at the same time.
144#define GF128MUL_MASK %xmm7
145
// Symbolic names for the general-purpose registers.  In the 64-bit build
// KEYP, OUTP, INP, LEN and IVP are the registers named by arg1..arg5.
146#ifdef __x86_64__
147#define AREG	%rax
148#define KEYP	%rdi
149#define OUTP	%rsi
150#define UKEYP	OUTP
151#define INP	%rdx
152#define LEN	%rcx
153#define IVP	%r8
154#define KLEN	%r9d
155#define T1	%r10
156#define TKEYP	T1
157#define T2	%r11
158#define TCTR_LOW T2
159#else
160#define AREG	%eax
161#define KEYP	%edi
162#define OUTP	AREG
163#define UKEYP	OUTP
164#define INP	%edx
165#define LEN	%esi
166#define IVP	%ebp
167#define KLEN	%ebx
168#define T1	%ecx
169#define TKEYP	T1
170#endif
172
# FUNC_SAVE: push r12, r13 and r14 (3 * 8 bytes, the value of STACK_OFFSET).
# r12 and r13 are used as scratch by the GCM macros in this file.
# Must be paired with FUNC_RESTORE, which pops in the reverse order.
173.macro FUNC_SAVE
174	push	%r12
175	push	%r13
176	push	%r14
177#
178# states of %xmm registers %xmm6:%xmm15 not saved
179# all %xmm registers are clobbered
180#
181.endm
182
183
# FUNC_RESTORE: pop r14, r13 and r12, the reverse of the pushes done by
# FUNC_SAVE.  %rsp must be back at the value FUNC_SAVE left it at.
184.macro FUNC_RESTORE
185	pop	%r14
186	pop	%r13
187	pop	%r12
188.endm
189
190# Precompute hashkeys.
191# Input: Hash subkey.
192# Output: HashKeys stored in gcm_context_data.  Only needs to be called
193# once per key.
# Stores HashKey^1..4 (each <<1 mod poly) and, for each power, the XOR of
# its high and low 64-bit halves (the *_k slots) at offsets from %arg2.
194# clobbers r12, and tmp xmm registers.
195.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
196	mov	\SUBKEY, %r12			# r12 = pointer to the hash subkey
197	movdqu	(%r12), \TMP3
198	movdqa	SHUF_MASK(%rip), \TMP2
199	pshufb	\TMP2, \TMP3			# byte-reverse the subkey
200
201	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
202
	# 128-bit left shift by one built from two 64-bit shifts: TMP2 carries
	# bit 63 of the low half into the high half, TMP1 keeps the bit that
	# was shifted out of the top.
203	movdqa	\TMP3, \TMP2
204	psllq	$1, \TMP3
205	psrlq	$63, \TMP2
206	movdqa	\TMP2, \TMP1
207	pslldq	$8, \TMP2
208	psrldq	$8, \TMP1
209	por	\TMP2, \TMP3
210
211	# reduce HashKey<<1
212
	# TMP2 becomes POLY when the shifted-out bit was set and zero
	# otherwise, so the xor below applies the reduction conditionally
	# without a branch.
213	pshufd	$0x24, \TMP1, \TMP2
214	pcmpeqd TWOONE(%rip), \TMP2
215	pand	POLY(%rip), \TMP2
216	pxor	\TMP2, \TMP3
217	movdqu	\TMP3, HashKey(%arg2)
218
219	movdqa	   \TMP3, \TMP5
220	pshufd	   $78, \TMP3, \TMP1		# swap the 64-bit halves
221	pxor	   \TMP3, \TMP1			# TMP1 = high half xor low half
222	movdqu	   \TMP1, HashKey_k(%arg2)
223
224	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
225# TMP5 = HashKey^2<<1 (mod poly)
226	movdqu	   \TMP5, HashKey_2(%arg2)
227# HashKey_2 = HashKey^2<<1 (mod poly)
228	pshufd	   $78, \TMP5, \TMP1
229	pxor	   \TMP5, \TMP1
230	movdqu	   \TMP1, HashKey_2_k(%arg2)
231
232	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
233# TMP5 = HashKey^3<<1 (mod poly)
234	movdqu	   \TMP5, HashKey_3(%arg2)
235	pshufd	   $78, \TMP5, \TMP1
236	pxor	   \TMP5, \TMP1
237	movdqu	   \TMP1, HashKey_3_k(%arg2)
238
239	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
240# TMP5 = HashKey^4<<1 (mod poly)
241	movdqu	   \TMP5, HashKey_4(%arg2)
242	pshufd	   $78, \TMP5, \TMP1
243	pxor	   \TMP5, \TMP1
244	movdqu	   \TMP1, HashKey_4_k(%arg2)
245.endm
246
247# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Stores the AAD length, zeroes the running lengths and the partial-block
# state, saves the IV both as given (OrigIV) and byte-reversed (CurCount),
# precomputes the hash key powers and hashes the AAD into AadHash.
# The context is addressed through %arg2.
248# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
# Also overwrites %xmm7 (scratch passed to PRECOMPUTE) and %xmm14 (loaded
# by CALC_AAD_HASH).
249.macro GCM_INIT Iv SUBKEY AAD AADLEN
250	mov \AADLEN, %r11
251	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
252	xor %r11d, %r11d
253	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
254	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
255	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
256	mov \Iv, %rax
257	movdqu (%rax), %xmm0
258	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
259
260	movdqa  SHUF_MASK(%rip), %xmm2
261	pshufb %xmm2, %xmm0
262	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
263
264	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
265	movdqu HashKey(%arg2), %xmm13
266
267	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
268	%xmm4, %xmm5, %xmm6
269.endm
270
271# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
272# struct has been initialized by GCM_INIT.
# Register use visible in the body: %arg1 = AES round keys, %arg2 = context,
# %arg3 = output buffer, %arg4 = input buffer, %arg5 = length in bytes.
# %r11 is the running byte offset into both buffers.  The operation
# argument is the literal enc or dec (tested with .ifc).
273# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
274# Clobbers rax, r10-r13, and xmm0-xmm15
275.macro GCM_ENC_DEC operation
276	movdqu AadHash(%arg2), %xmm8
277	movdqu HashKey(%arg2), %xmm13
278	add %arg5, InLen(%arg2)		# account for this call in the total length
279
280	xor %r11d, %r11d # initialise the data pointer offset as zero
	# First complete (or extend) a partial block left by a previous call.
281	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
282
283	sub %r11, %arg5		# sub partial block data used
284	mov %arg5, %r13		# save the number of bytes
285
286	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
287	mov %r13, %r12
288	# Encrypt/Decrypt first few blocks
289
	# Dispatch on (number of full blocks) mod 4 so that the main loop
	# below can always work on groups of four blocks.
290	and	$(3<<4), %r12
291	jz	_initial_num_blocks_is_0_\@
292	cmp	$(2<<4), %r12
293	jb	_initial_num_blocks_is_1_\@
294	je	_initial_num_blocks_is_2_\@
295_initial_num_blocks_is_3_\@:
296	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
297%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
298	sub	$48, %r13
299	jmp	_initial_blocks_\@
300_initial_num_blocks_is_2_\@:
301	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
302%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
303	sub	$32, %r13
304	jmp	_initial_blocks_\@
305_initial_num_blocks_is_1_\@:
306	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
307%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
308	sub	$16, %r13
309	jmp	_initial_blocks_\@
310_initial_num_blocks_is_0_\@:
311	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
312%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
313_initial_blocks_\@:
314
315	# Main loop - Encrypt/Decrypt remaining blocks
316
	# Here %r13 is a multiple of 64.  When it is non-zero,
	# INITIAL_BLOCKS_ENC_DEC has already processed four blocks whose
	# GHASH is still pending.
317	test	%r13, %r13
318	je	_zero_cipher_left_\@
319	sub	$64, %r13
320	je	_four_cipher_left_\@
321_crypt_by_4_\@:
322	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
323	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
324	%xmm7, %xmm8, enc	# NOTE(review): literal enc, not the operation argument; presumably unused by the callee - confirm
325	add	$64, %r11
326	sub	$64, %r13
327	jne	_crypt_by_4_\@
328_four_cipher_left_\@:
329	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
330%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
331_zero_cipher_left_\@:
332	movdqu %xmm8, AadHash(%arg2)
333	movdqu %xmm0, CurCount(%arg2)
334
335	mov	%arg5, %r13
336	and	$15, %r13			# %r13 = arg5 (mod 16)
337	je	_multiple_of_16_bytes_\@
338
339	mov %r13, PBlockLen(%arg2)	# remember the tail length for the next call
340
341	# Handle the last <16 Byte block separately
342	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
343	movdqu %xmm0, CurCount(%arg2)
344	movdqa SHUF_MASK(%rip), %xmm10
345	pshufb %xmm10, %xmm0
346
347	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
348	movdqu %xmm0, PBlockEncKey(%arg2)
349
	# With at least 16 bytes available the tail is fetched with one
	# 16-byte load that ends exactly at the end of the input and is then
	# shifted into place; otherwise it is read byte-wise so that nothing
	# outside the buffer is touched.
350	cmp	$16, %arg5
351	jge _large_enough_update_\@
352
353	lea (%arg4,%r11,1), %r10
354	mov %r13, %r12
355	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
356	jmp _data_read_\@
357
358_large_enough_update_\@:
359	sub	$16, %r11
360	add	%r13, %r11
361
362	# receive the last <16 Byte block
363	movdqu	(%arg4, %r11, 1), %xmm1
364
365	sub	%r13, %r11
366	add	$16, %r11
367
368	lea	SHIFT_MASK+16(%rip), %r12
369	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
370	# (r13 is the number of bytes in plaintext mod 16)
371	sub	%r13, %r12
372	# get the appropriate shuffle mask
373	movdqu	(%r12), %xmm2
374	# shift right 16-r13 bytes
375	pshufb  %xmm2, %xmm1
376
377_data_read_\@:
378	lea ALL_F+16(%rip), %r12
379	sub %r13, %r12			# mask with the low r13 bytes set
380
381.ifc \operation, dec
382	movdqa  %xmm1, %xmm2		# keep the ciphertext for GHASH
383.endif
384	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
385	movdqu	(%r12), %xmm1
386	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
387	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
388.ifc \operation, dec
389	pand    %xmm1, %xmm2
390	movdqa SHUF_MASK(%rip), %xmm10
391	pshufb %xmm10 ,%xmm2
392
393	pxor %xmm2, %xmm8
394.else
395	movdqa SHUF_MASK(%rip), %xmm10
396	pshufb %xmm10,%xmm0
397
398	pxor	%xmm0, %xmm8
399.endif
400
	# The tail is only xored into the hash here; the multiplication for it
	# is done later (by PARTIAL_BLOCK or GCM_COMPLETE).
401	movdqu %xmm8, AadHash(%arg2)
402.ifc \operation, enc
403	# GHASH computation for the last <16 byte block
404	movdqa SHUF_MASK(%rip), %xmm10
405	# shuffle xmm0 back to output as ciphertext
406	pshufb %xmm10, %xmm0
407.endif
408
409	# Output %r13 bytes
	# An 8-byte store is used only when more than 8 bytes remain; the rest
	# (1 to 8 bytes) is written one byte at a time.
410	movq %xmm0, %rax
411	cmp $8, %r13
412	jle _less_than_8_bytes_left_\@
413	mov %rax, (%arg3 , %r11, 1)
414	add $8, %r11
415	psrldq $8, %xmm0
416	movq %xmm0, %rax
417	sub $8, %r13
418_less_than_8_bytes_left_\@:
419	mov %al,  (%arg3, %r11, 1)
420	add $1, %r11
421	shr $8, %rax
422	sub $1, %r13
423	jne _less_than_8_bytes_left_\@
424_multiple_of_16_bytes_\@:
425.endm
426
427# GCM_COMPLETE Finishes update of tag of last partial block
# Finishes the GHASH of a pending partial block, folds in the length block
# len(A)||len(C) (both in bits), xors the result with E(K, Y0) and stores
# AUTHTAGLEN bytes of it at AUTHTAG.  The context is addressed via %arg2.
428# Output: Authorization Tag (AUTH_TAG)
# NOTE(review): the 4-byte store at _T_4_ is unconditional, so tag lengths
# 1-3 and 9-11 would write more than requested; callers presumably only
# pass 4-8 or 12-16 - confirm.
429# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
430.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
431	movdqu AadHash(%arg2), %xmm8
432	movdqu HashKey(%arg2), %xmm13
433
434	mov PBlockLen(%arg2), %r12
435
436	test %r12, %r12
437	je _partial_done\@
438
	# A partial block was xored into the hash but not yet multiplied.
439	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
440
441_partial_done\@:
442	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
443	shl	$3, %r12		  # convert into number of bits
	# len(A) is a 64-bit field of the length block, so move the whole
	# register; a 32-bit move would drop the upper bits for very long AAD.
444	movq	%r12, %xmm15		  # len(A) in %xmm15
445	mov InLen(%arg2), %r12
446	shl     $3, %r12                  # len(C) in bits (*128)
447	movq    %r12, %xmm1
448
449	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
450	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
451	pxor	%xmm15, %xmm8
452	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
453	# final GHASH computation
454	movdqa SHUF_MASK(%rip), %xmm10
455	pshufb %xmm10, %xmm8
456
457	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
458	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
459	pxor	%xmm8, %xmm0
	# %xmm0 now holds the full 16-byte tag; write out the requested number
	# of bytes using 16-, 8-, 4-, 2- and 1-byte stores.
460_return_T_\@:
461	mov	\AUTHTAG, %r10                     # %r10 = authTag
462	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
463	cmp	$16, %r11
464	je	_T_16_\@
465	cmp	$8, %r11
466	jl	_T_4_\@
467_T_8_\@:
468	movq	%xmm0, %rax
469	mov	%rax, (%r10)
470	add	$8, %r10
471	sub	$8, %r11
472	psrldq	$8, %xmm0
473	test	%r11, %r11
474	je	_return_T_done_\@
475_T_4_\@:
476	movd	%xmm0, %eax
477	mov	%eax, (%r10)
478	add	$4, %r10
479	sub	$4, %r11
480	psrldq	$4, %xmm0
481	test	%r11, %r11
482	je	_return_T_done_\@
483_T_123_\@:
484	movd	%xmm0, %eax
485	cmp	$2, %r11
486	jl	_T_1_\@
487	mov	%ax, (%r10)
488	cmp	$2, %r11
489	je	_return_T_done_\@
490	add	$2, %r10
491	sar	$16, %eax
492_T_1_\@:
493	mov	%al, (%r10)
494	jmp	_return_T_done_\@
495_T_16_\@:
496	movdqu	%xmm0, (%r10)
497_return_T_done_\@:
498.endm
499
500#ifdef __x86_64__
501/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
502*
503*
504* Input: A and B (128-bits each, bit-reflected)
505* Output: C = A*B*x mod poly, (i.e. >>1 )
506* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
507* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
508*
* The result is written back to GH.  HK is only read.  TMP1-TMP5 are
* scratch registers and are overwritten.
509*/
510.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	# Karatsuba multiplication: three carry-less multiplies instead of four.
511	movdqa	  \GH, \TMP1
512	pshufd	  $78, \GH, \TMP2
513	pshufd	  $78, \HK, \TMP3
514	pxor	  \GH, \TMP2            # TMP2 = a1+a0
515	pxor	  \HK, \TMP3            # TMP3 = b1+b0
516	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
517	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
518	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
519	pxor	  \GH, \TMP2
520	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0), the middle term
521	movdqa	  \TMP2, \TMP3
522	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
523	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
524	pxor	  \TMP3, \GH
525	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
526
527        # first phase of the reduction
528
529	movdqa    \GH, \TMP2
530	movdqa    \GH, \TMP3
531	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
532					# in order to perform
533					# independent shifts
534	pslld     $31, \TMP2            # packed left shift <<31
535	pslld     $30, \TMP3            # packed left shift <<30
536	pslld     $25, \TMP4            # packed left shift <<25
537	pxor      \TMP3, \TMP2          # xor the shifted versions
538	pxor      \TMP4, \TMP2
539	movdqa    \TMP2, \TMP5
540	psrldq    $4, \TMP5             # right shift TMP5 1 DW
541	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
542	pxor      \TMP2, \GH
543
544        # second phase of the reduction
545
546	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
547					# in order to perform
548					# independent shifts
549	movdqa    \GH,\TMP3
550	movdqa    \GH,\TMP4
551	psrld     $1,\TMP2              # packed right shift >>1
552	psrld     $2,\TMP3              # packed right shift >>2
553	psrld     $7,\TMP4              # packed right shift >>7
554	pxor      \TMP3,\TMP2		# xor the shifted versions
555	pxor      \TMP4,\TMP2
556	pxor      \TMP5, \TMP2
557	pxor      \TMP2, \GH
558	pxor      \TMP1, \GH            # result is in GH
559.endm
560
561# Reads DLEN bytes starting at DPTR and stores in XMMDst
562# where 0 < DLEN < 16
# Only the DLEN bytes themselves are read from memory: one 8-byte load when
# DLEN >= 8, and single-byte loads for everything else.  The bytes end up
# at the low end of XMMDst; the remaining bytes of XMMDst are zero.
563# Clobbers %rax, DLEN and XMM1
564.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
565        cmp $8, \DLEN
566        jl _read_lt8_\@
567        mov (\DPTR), %rax		# first 8 bytes in one load
568        movq %rax, \XMMDst		# also zeroes the high half of XMMDst
569        sub $8, \DLEN
570        jz _done_read_partial_block_\@
571	xor %eax, %eax
572_read_next_byte_\@:
	# Collect bytes 8..DLEN-1 in rax, last byte first, so that the first
	# of them ends up in the least significant position.
573        shl $8, %rax
574        mov 7(\DPTR, \DLEN, 1), %al
575        dec \DLEN
576        jnz _read_next_byte_\@
577        movq %rax, \XMM1
578	pslldq $8, \XMM1		# move them into the high half
579        por \XMM1, \XMMDst
580	jmp _done_read_partial_block_\@
581_read_lt8_\@:
582	xor %eax, %eax
583_read_next_byte_lt8_\@:
	# Fewer than 8 bytes: same byte-wise gather, starting at the last byte.
584        shl $8, %rax
585        mov -1(\DPTR, \DLEN, 1), %al
586        dec \DLEN
587        jnz _read_next_byte_lt8_\@
588        movq %rax, \XMMDst
589_done_read_partial_block_\@:
590.endm
591
592# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# Hashes AADLEN bytes at AAD with HASHKEY: full 16-byte blocks first, then
# a final short block read with READ_PARTIAL_BLOCK (its unused bytes are
# zero).  The result is stored in AadHash(%arg2) and left in TMP6.
593# clobbers r10-11, xmm14
# Also overwrites rax (through READ_PARTIAL_BLOCK) and TMP1-TMP7.
594.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
595	TMP6 TMP7
596	MOVADQ	   SHUF_MASK(%rip), %xmm14
597	mov	   \AAD, %r10		# %r10 = AAD
598	mov	   \AADLEN, %r11		# %r11 = aadLen
599	pxor	   \TMP7, \TMP7
600	pxor	   \TMP6, \TMP6		# TMP6 = running hash, starts at zero
601
602	cmp	   $16, %r11
603	jl	   _get_AAD_rest\@
604_get_AAD_blocks\@:
605	movdqu	   (%r10), \TMP7
606	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
607	pxor	   \TMP7, \TMP6
608	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
609	add	   $16, %r10
610	sub	   $16, %r11
611	cmp	   $16, %r11
612	jge	   _get_AAD_blocks\@
613
	# NOTE(review): this copy looks redundant - TMP7 is either overwritten
	# by READ_PARTIAL_BLOCK below or not read again.
614	movdqu	   \TMP6, \TMP7
615
616	/* read the last <16B of AAD */
617_get_AAD_rest\@:
618	test	   %r11, %r11
619	je	   _get_AAD_done\@
620
621	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
622	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
623	pxor	   \TMP6, \TMP7
624	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
625	movdqu \TMP7, \TMP6
626
627_get_AAD_done\@:
628	movdqu \TMP6, AadHash(%arg2)
629.endm
630
631# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
632# between update calls.
# Does nothing when PBlockLen is zero.  Otherwise the new input is combined
# with the unused part of the saved keystream block (PBlockEncKey), the
# ciphertext bytes are xored into AAD_HASH, and the GHASH multiplication is
# performed only once the 16-byte block is complete.  DATA_OFFSET is
# advanced by the number of bytes written to CYPH_PLAIN_OUT.
633# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
634# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
635# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
636.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
637	AAD_HASH operation
638	mov 	PBlockLen(%arg2), %r13	# r13 = bytes already in the partial block
639	test	%r13, %r13
640	je	_partial_block_done_\@	# Leave Macro if no partial blocks
641	# Read in input data without over reading
642	cmp	$16, \PLAIN_CYPH_LEN
643	jl	_fewer_than_16_bytes_\@
644	movups	(\PLAIN_CYPH_IN), %xmm1	# If at least 16 bytes, just fill xmm
645	jmp	_data_read_\@
646
647_fewer_than_16_bytes_\@:
648	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
649	mov	\PLAIN_CYPH_LEN, %r12
650	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
651
652	mov PBlockLen(%arg2), %r13
653
654_data_read_\@:				# Finished reading in data
655
656	movdqu	PBlockEncKey(%arg2), %xmm9
657	movdqu	HashKey(%arg2), %xmm13
658
659	lea	SHIFT_MASK(%rip), %r12
660
661	# adjust the shuffle mask pointer to be able to shift r13 bytes
662	# (16-r13 is the number of bytes still free in the partial block)
663	add	%r13, %r12
664	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
665	pshufb	%xmm2, %xmm9		# shift right r13 bytes
666
667.ifc \operation, dec
668	movdqa	%xmm1, %xmm3		# keep the ciphertext for GHASH
669	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
670
671	mov	\PLAIN_CYPH_LEN, %r10
672	add	%r13, %r10
673	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
674	sub	$16, %r10
675	# Determine if partial block is not being filled and
676	# shift mask accordingly
677	jge	_no_extra_mask_1_\@
678	sub	%r10, %r12		# r10 < 0: move the mask by the shortfall
679_no_extra_mask_1_\@:
680
681	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
682	# the mask keeps the low bytes of xmm9 that hold data from this call
683	pand	%xmm1, %xmm9		# and zeroes the bytes above them
684
685	pand	%xmm1, %xmm3
686	movdqa	SHUF_MASK(%rip), %xmm10
687	pshufb	%xmm10, %xmm3
688	pshufb	%xmm2, %xmm3
689	pxor	%xmm3, \AAD_HASH
690
691	test	%r10, %r10
692	jl	_partial_incomplete_1_\@
693
694	# GHASH computation for the last <16 Byte block
695	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
696	xor	%eax, %eax
697
698	mov	%rax, PBlockLen(%arg2)	# block complete, nothing pending
699	jmp	_dec_done_\@
700_partial_incomplete_1_\@:
701	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)	# still short of 16 bytes
702_dec_done_\@:
703	movdqu	\AAD_HASH, AadHash(%arg2)
704.else
705	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
706
707	mov	\PLAIN_CYPH_LEN, %r10
708	add	%r13, %r10
709	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
710	sub	$16, %r10
711	# Determine if partial block is not being filled and
712	# shift mask accordingly
713	jge	_no_extra_mask_2_\@
714	sub	%r10, %r12		# r10 < 0: move the mask by the shortfall
715_no_extra_mask_2_\@:
716
717	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
718	# the mask keeps the low bytes of xmm9 that hold data from this call
719	pand	%xmm1, %xmm9
720
721	movdqa	SHUF_MASK(%rip), %xmm1
722	pshufb	%xmm1, %xmm9
723	pshufb	%xmm2, %xmm9
724	pxor	%xmm9, \AAD_HASH
725
726	test	%r10, %r10
727	jl	_partial_incomplete_2_\@
728
729	# GHASH computation for the last <16 Byte block
730	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
731	xor	%eax, %eax
732
733	mov	%rax, PBlockLen(%arg2)	# block complete, nothing pending
734	jmp	_encode_done_\@
735_partial_incomplete_2_\@:
736	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)	# still short of 16 bytes
737_encode_done_\@:
738	movdqu	\AAD_HASH, AadHash(%arg2)
739
740	movdqa	SHUF_MASK(%rip), %xmm10
741	# shuffle xmm9 back to output as ciphertext
742	pshufb	%xmm10, %xmm9
743	pshufb	%xmm2, %xmm9
744.endif
745	# output encrypted Bytes
746	test	%r10, %r10
747	jl	_partial_fill_\@
748	mov	%r13, %r12
749	mov	$16, %r13
750	# Set r13 to be the number of bytes to write out
751	sub	%r12, %r13
752	jmp	_count_set_\@
753_partial_fill_\@:
754	mov	\PLAIN_CYPH_LEN, %r13
755_count_set_\@:
	# An 8-byte store is used only when more than 8 bytes remain; the rest
	# (1 to 8 bytes) is written one byte at a time.
756	movdqa	%xmm9, %xmm0
757	movq	%xmm0, %rax
758	cmp	$8, %r13
759	jle	_less_than_8_bytes_left_\@
760
761	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
762	add	$8, \DATA_OFFSET
763	psrldq	$8, %xmm0
764	movq	%xmm0, %rax
765	sub	$8, %r13
766_less_than_8_bytes_left_\@:
767	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
768	add	$1, \DATA_OFFSET
769	shr	$8, %rax
770	sub	$1, %r13
771	jne	_less_than_8_bytes_left_\@
772_partial_block_done_\@:
773.endm # PARTIAL_BLOCK
774
775/*
776* if a = number of total plaintext bytes
777* b = floor(a/16)
778* num_initial_blocks = b mod 4
779* encrypt the initial num_initial_blocks blocks and apply ghash on
780* the ciphertext
*
* i is the number of the first xmm register of the GHASH chain (5, 6, 7,
* or 8 when there are no initial blocks); i_seq lists the xmm registers
* that hold the initial blocks (678, 78, 8, or 0 when there are none).
* When at least 64 bytes remain in %r13 afterwards, four more counter
* blocks are encrypted and left byte-reflected in XMM1-XMM4 for the main
* loop, with the running hash already xored into XMM1.
*
781* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
782* are clobbered
* (%xmm14 and the TMP and XMM register arguments are overwritten as well)
783* %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
784*/
785
786
787.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
788	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
789	MOVADQ		SHUF_MASK(%rip), %xmm14
790
791	movdqu AadHash(%arg2), %xmm\i		    # xmm<i> = running GHASH value
792
793	# start AES for num_initial_blocks blocks
794
795	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
796
797.if (\i == 5) || (\i == 6) || (\i == 7)
798
799	MOVADQ		ONE(%RIP),\TMP1
800	MOVADQ		0(%arg1),\TMP2
801.irpc index, \i_seq
802	paddd		\TMP1, \XMM0                 # INCR Y0
803.ifc \operation, dec
804        movdqa     \XMM0, %xmm\index
805.else
806	MOVADQ		\XMM0, %xmm\index
807.endif
808	pshufb	%xmm14, %xmm\index      # perform a 16 byte swap
809	pxor		\TMP2, %xmm\index	# xor with the first round key
810.endr
811	lea	0x10(%arg1),%r10
812	mov	keysize,%eax
813	shr	$2,%eax				# 128->4, 192->6, 256->8
814	add	$5,%eax			      # 128->9, 192->11, 256->13
815
	# eax = number of aesenc rounds; the last round key is applied with
	# aesenclast after the loop.
816aes_loop_initial_\@:
817	MOVADQ	(%r10),\TMP1
818.irpc	index, \i_seq
819	aesenc	\TMP1, %xmm\index
820.endr
821	add	$16,%r10
822	sub	$1,%eax
823	jnz	aes_loop_initial_\@
824
825	MOVADQ	(%r10), \TMP1
826.irpc index, \i_seq
827	aesenclast \TMP1, %xmm\index         # Last Round
828.endr
829.irpc index, \i_seq
830	movdqu	   (%arg4 , %r11, 1), \TMP1
831	pxor	   \TMP1, %xmm\index
832	movdqu	   %xmm\index, (%arg3 , %r11, 1)
833	# write back plaintext/ciphertext for num_initial_blocks
834	add	   $16, %r11
835
836.ifc \operation, dec
837	movdqa     \TMP1, %xmm\index		# GHASH always runs over the ciphertext
838.endif
839	pshufb	   %xmm14, %xmm\index
840
841		# prepare plaintext/ciphertext for GHASH computation
842.endr
843.endif
844
845        # apply GHASH on num_initial_blocks blocks
846
	# The hash is chained through xmm<i> .. xmm8, so it always ends up in
	# %xmm8.
847.if \i == 5
848        pxor       %xmm5, %xmm6
849	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
850        pxor       %xmm6, %xmm7
851	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
852        pxor       %xmm7, %xmm8
853	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
854.elseif \i == 6
855        pxor       %xmm6, %xmm7
856	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
857        pxor       %xmm7, %xmm8
858	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
859.elseif \i == 7
860        pxor       %xmm7, %xmm8
861	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
862.endif
863	cmp	   $64, %r13
864	jl	_initial_blocks_done\@
865	# fewer than 64 bytes in r13 skips the four-block setup below
866/*
867*
868* Encryption of the first 4 blocks for the main loop.  The hash key powers
869* HashKey_i and HashKey_i_k used by that loop are not computed here.
870*/
871	MOVADQ	   ONE(%RIP),\TMP1
872	paddd	   \TMP1, \XMM0              # INCR Y0
873	MOVADQ	   \XMM0, \XMM1
874	pshufb  %xmm14, \XMM1        # perform a 16 byte swap
875
876	paddd	   \TMP1, \XMM0              # INCR Y0
877	MOVADQ	   \XMM0, \XMM2
878	pshufb  %xmm14, \XMM2        # perform a 16 byte swap
879
880	paddd	   \TMP1, \XMM0              # INCR Y0
881	MOVADQ	   \XMM0, \XMM3
882	pshufb %xmm14, \XMM3        # perform a 16 byte swap
883
884	paddd	   \TMP1, \XMM0              # INCR Y0
885	MOVADQ	   \XMM0, \XMM4
886	pshufb %xmm14, \XMM4        # perform a 16 byte swap
887
888	MOVADQ	   0(%arg1),\TMP1
889	pxor	   \TMP1, \XMM1
890	pxor	   \TMP1, \XMM2
891	pxor	   \TMP1, \XMM3
892	pxor	   \TMP1, \XMM4
893.irpc index, 1234 # do 4 rounds
894	movaps 0x10*\index(%arg1), \TMP1
895	aesenc	   \TMP1, \XMM1
896	aesenc	   \TMP1, \XMM2
897	aesenc	   \TMP1, \XMM3
898	aesenc	   \TMP1, \XMM4
899.endr
900.irpc index, 56789 # do next 5 rounds
901	movaps 0x10*\index(%arg1), \TMP1
902	aesenc	   \TMP1, \XMM1
903	aesenc	   \TMP1, \XMM2
904	aesenc	   \TMP1, \XMM3
905	aesenc	   \TMP1, \XMM4
906.endr
907	lea	   0xa0(%arg1),%r10
908	mov	   keysize,%eax
909	shr	   $2,%eax			# 128->4, 192->6, 256->8
910	sub	   $4,%eax			# 128->0, 192->2, 256->4
911	jz	   aes_loop_pre_done\@
912
	# Extra rounds needed only for 192- and 256-bit keys.
	# NOTE(review): the loop names %xmm1-%xmm4 directly, which assumes the
	# caller passed exactly these registers as XMM1-XMM4 - confirm.
913aes_loop_pre_\@:
914	MOVADQ	   (%r10),\TMP2
915.irpc	index, 1234
916	aesenc	   \TMP2, %xmm\index
917.endr
918	add	   $16,%r10
919	sub	   $1,%eax
920	jnz	   aes_loop_pre_\@
921
922aes_loop_pre_done\@:
923	MOVADQ	   (%r10), \TMP2
924	aesenclast \TMP2, \XMM1
925	aesenclast \TMP2, \XMM2
926	aesenclast \TMP2, \XMM3
927	aesenclast \TMP2, \XMM4
	# Xor the keystream with the input.  When decrypting, the plaintext is
	# stored right away and the register is reloaded with the ciphertext,
	# because that is what gets hashed.
928	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
929	pxor	   \TMP1, \XMM1
930.ifc \operation, dec
931	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
932	movdqa     \TMP1, \XMM1
933.endif
934	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
935	pxor	   \TMP1, \XMM2
936.ifc \operation, dec
937	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
938	movdqa     \TMP1, \XMM2
939.endif
940	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
941	pxor	   \TMP1, \XMM3
942.ifc \operation, dec
943	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
944	movdqa     \TMP1, \XMM3
945.endif
946	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
947	pxor	   \TMP1, \XMM4
948.ifc \operation, dec
949	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
950	movdqa     \TMP1, \XMM4
951.else
952	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
953	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
954	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
955	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
956.endif
957
958	add	   $64, %r11
959	pshufb %xmm14, \XMM1 # perform a 16 byte swap
960	pxor	   \XMMDst, \XMM1
961# combine GHASHed value with the corresponding ciphertext
962	pshufb %xmm14, \XMM2 # perform a 16 byte swap
963	pshufb %xmm14, \XMM3 # perform a 16 byte swap
964	pshufb %xmm14, \XMM4 # perform a 16 byte swap
965
966_initial_blocks_done\@:
967
968.endm
969
970/*
971* encrypt 4 blocks at a time
972* ghash the 4 previously encrypted ciphertext blocks
973* arg1, %arg3, %arg4 are used as pointers only, not modified
974* %r11 is the data offset value
975*/
976.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
977TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
978
979	movdqa	  \XMM1, \XMM5
980	movdqa	  \XMM2, \XMM6
981	movdqa	  \XMM3, \XMM7
982	movdqa	  \XMM4, \XMM8
983
984        movdqa    SHUF_MASK(%rip), %xmm15
985        # multiply TMP5 * HashKey using karatsuba
986
987	movdqa	  \XMM5, \TMP4
988	pshufd	  $78, \XMM5, \TMP6
989	pxor	  \XMM5, \TMP6
990	paddd     ONE(%rip), \XMM0		# INCR CNT
991	movdqu	  HashKey_4(%arg2), \TMP5
992	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
993	movdqa    \XMM0, \XMM1
994	paddd     ONE(%rip), \XMM0		# INCR CNT
995	movdqa    \XMM0, \XMM2
996	paddd     ONE(%rip), \XMM0		# INCR CNT
997	movdqa    \XMM0, \XMM3
998	paddd     ONE(%rip), \XMM0		# INCR CNT
999	movdqa    \XMM0, \XMM4
1000	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1001	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1002	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1003	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1004	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1005
1006	pxor	  (%arg1), \XMM1
1007	pxor	  (%arg1), \XMM2
1008	pxor	  (%arg1), \XMM3
1009	pxor	  (%arg1), \XMM4
1010	movdqu	  HashKey_4_k(%arg2), \TMP5
1011	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1012	movaps 0x10(%arg1), \TMP1
1013	aesenc	  \TMP1, \XMM1              # Round 1
1014	aesenc	  \TMP1, \XMM2
1015	aesenc	  \TMP1, \XMM3
1016	aesenc	  \TMP1, \XMM4
1017	movaps 0x20(%arg1), \TMP1
1018	aesenc	  \TMP1, \XMM1              # Round 2
1019	aesenc	  \TMP1, \XMM2
1020	aesenc	  \TMP1, \XMM3
1021	aesenc	  \TMP1, \XMM4
1022	movdqa	  \XMM6, \TMP1
1023	pshufd	  $78, \XMM6, \TMP2
1024	pxor	  \XMM6, \TMP2
1025	movdqu	  HashKey_3(%arg2), \TMP5
1026	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1027	movaps 0x30(%arg1), \TMP3
1028	aesenc    \TMP3, \XMM1              # Round 3
1029	aesenc    \TMP3, \XMM2
1030	aesenc    \TMP3, \XMM3
1031	aesenc    \TMP3, \XMM4
1032	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1033	movaps 0x40(%arg1), \TMP3
1034	aesenc	  \TMP3, \XMM1              # Round 4
1035	aesenc	  \TMP3, \XMM2
1036	aesenc	  \TMP3, \XMM3
1037	aesenc	  \TMP3, \XMM4
1038	movdqu	  HashKey_3_k(%arg2), \TMP5
1039	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1040	movaps 0x50(%arg1), \TMP3
1041	aesenc	  \TMP3, \XMM1              # Round 5
1042	aesenc	  \TMP3, \XMM2
1043	aesenc	  \TMP3, \XMM3
1044	aesenc	  \TMP3, \XMM4
1045	pxor	  \TMP1, \TMP4
1046# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1047	pxor	  \XMM6, \XMM5
1048	pxor	  \TMP2, \TMP6
1049	movdqa	  \XMM7, \TMP1
1050	pshufd	  $78, \XMM7, \TMP2
1051	pxor	  \XMM7, \TMP2
1052	movdqu	  HashKey_2(%arg2), \TMP5
1053
1054        # Multiply TMP5 * HashKey using karatsuba
1055
1056	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1057	movaps 0x60(%arg1), \TMP3
1058	aesenc	  \TMP3, \XMM1              # Round 6
1059	aesenc	  \TMP3, \XMM2
1060	aesenc	  \TMP3, \XMM3
1061	aesenc	  \TMP3, \XMM4
1062	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1063	movaps 0x70(%arg1), \TMP3
1064	aesenc	  \TMP3, \XMM1              # Round 7
1065	aesenc	  \TMP3, \XMM2
1066	aesenc	  \TMP3, \XMM3
1067	aesenc	  \TMP3, \XMM4
1068	movdqu	  HashKey_2_k(%arg2), \TMP5
1069	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1070	movaps 0x80(%arg1), \TMP3
1071	aesenc	  \TMP3, \XMM1              # Round 8
1072	aesenc	  \TMP3, \XMM2
1073	aesenc	  \TMP3, \XMM3
1074	aesenc	  \TMP3, \XMM4
1075	pxor	  \TMP1, \TMP4
1076# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1077	pxor	  \XMM7, \XMM5
1078	pxor	  \TMP2, \TMP6
1079
1080        # Multiply XMM8 * HashKey
1081        # XMM8 and TMP5 hold the values for the two operands
1082
1083	movdqa	  \XMM8, \TMP1
1084	pshufd	  $78, \XMM8, \TMP2
1085	pxor	  \XMM8, \TMP2
1086	movdqu	  HashKey(%arg2), \TMP5
1087	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1088	movaps 0x90(%arg1), \TMP3
1089	aesenc	  \TMP3, \XMM1             # Round 9
1090	aesenc	  \TMP3, \XMM2
1091	aesenc	  \TMP3, \XMM3
1092	aesenc	  \TMP3, \XMM4
1093	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1094	lea	  0xa0(%arg1),%r10
1095	mov	  keysize,%eax
1096	shr	  $2,%eax			# 128->4, 192->6, 256->8
1097	sub	  $4,%eax			# 128->0, 192->2, 256->4
1098	jz	  aes_loop_par_enc_done\@
1099
1100aes_loop_par_enc\@:
1101	MOVADQ	  (%r10),\TMP3
1102.irpc	index, 1234
1103	aesenc	  \TMP3, %xmm\index
1104.endr
1105	add	  $16,%r10
1106	sub	  $1,%eax
1107	jnz	  aes_loop_par_enc\@
1108
1109aes_loop_par_enc_done\@:
1110	MOVADQ	  (%r10), \TMP3
1111	aesenclast \TMP3, \XMM1           # Round 10
1112	aesenclast \TMP3, \XMM2
1113	aesenclast \TMP3, \XMM3
1114	aesenclast \TMP3, \XMM4
1115	movdqu    HashKey_k(%arg2), \TMP5
1116	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1117	movdqu	  (%arg4,%r11,1), \TMP3
1118	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1119	movdqu	  16(%arg4,%r11,1), \TMP3
1120	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1121	movdqu	  32(%arg4,%r11,1), \TMP3
1122	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1123	movdqu	  48(%arg4,%r11,1), \TMP3
1124	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1125        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1126        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1127        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1128        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1129	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1130	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1131	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1132	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1133
1134	pxor	  \TMP4, \TMP1
1135	pxor	  \XMM8, \XMM5
1136	pxor	  \TMP6, \TMP2
1137	pxor	  \TMP1, \TMP2
1138	pxor	  \XMM5, \TMP2
1139	movdqa	  \TMP2, \TMP3
1140	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1141	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1142	pxor	  \TMP3, \XMM5
1143	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1144
1145        # first phase of reduction
1146
1147	movdqa    \XMM5, \TMP2
1148	movdqa    \XMM5, \TMP3
1149	movdqa    \XMM5, \TMP4
1150# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1151	pslld     $31, \TMP2                   # packed right shift << 31
1152	pslld     $30, \TMP3                   # packed right shift << 30
1153	pslld     $25, \TMP4                   # packed right shift << 25
1154	pxor      \TMP3, \TMP2	               # xor the shifted versions
1155	pxor      \TMP4, \TMP2
1156	movdqa    \TMP2, \TMP5
1157	psrldq    $4, \TMP5                    # right shift T5 1 DW
1158	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1159	pxor      \TMP2, \XMM5
1160
1161        # second phase of reduction
1162
1163	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1164	movdqa    \XMM5,\TMP3
1165	movdqa    \XMM5,\TMP4
1166	psrld     $1, \TMP2                    # packed left shift >>1
1167	psrld     $2, \TMP3                    # packed left shift >>2
1168	psrld     $7, \TMP4                    # packed left shift >>7
1169	pxor      \TMP3,\TMP2		       # xor the shifted versions
1170	pxor      \TMP4,\TMP2
1171	pxor      \TMP5, \TMP2
1172	pxor      \TMP2, \XMM5
1173	pxor      \TMP1, \XMM5                 # result is in TMP1
1174
1175	pxor	  \XMM5, \XMM1
1176.endm
1177
1178/*
1179* decrypt 4 blocks at a time
1180* ghash the 4 previously decrypted ciphertext blocks
1181* %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
1182* %r11 is the data offset value
1183*/
1184.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1185TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
/*
 * Decrypt-side 4-block step.  The AES rounds for four new counter blocks
 * are interleaved with the GHASH (Karatsuba multiply + reduction) of the
 * four blocks that XMM1..XMM4 hold on entry.
 *
 *   XMM0        counter block; ONE is added to it four times
 *   XMM1..XMM4  in:  the four blocks to be hashed
 *               out: the four 16-byte input blocks read from (%arg4,%r11),
 *                    byte-swapped, with the reduced hash XORed into XMM1
 *   XMM5..XMM8  scratch: copies of the blocks being hashed / partial products
 *   TMP1..TMP6  scratch
 *   operation   not referenced anywhere in the macro body
 *
 * Memory: round keys are read from %arg1, the HashKey values from %arg2,
 * input from (%arg4,%r11); 64 bytes of output are written to (%arg3,%r11).
 * Clobbers %eax, %r10, %xmm15 and the flags.
 * NOTE: the extra-round loop names %xmm1..%xmm4 directly, so it only works
 * as intended when XMM1..XMM4 are bound to exactly those registers.
 * keysize is defined outside this macro; per the shift comments below it is
 * presumably the key length in bytes (16/24/32) - confirm at its definition.
 */
1186
	# save the blocks to hash; XMM1..XMM4 are reused for the counter blocks
1187	movdqa	  \XMM1, \XMM5
1188	movdqa	  \XMM2, \XMM6
1189	movdqa	  \XMM3, \XMM7
1190	movdqa	  \XMM4, \XMM8
1191
1192        movdqa    SHUF_MASK(%rip), %xmm15
1193        # multiply XMM5 * HashKey_4 using karatsuba
1194
1195	movdqa	  \XMM5, \TMP4
1196	pshufd	  $78, \XMM5, \TMP6		# swap the two 64-bit halves
1197	pxor	  \XMM5, \TMP6			# both halves of TMP6 = a1+a0
1198	paddd     ONE(%rip), \XMM0		# INCR CNT
1199	movdqu	  HashKey_4(%arg2), \TMP5
1200	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1201	movdqa    \XMM0, \XMM1
1202	paddd     ONE(%rip), \XMM0		# INCR CNT
1203	movdqa    \XMM0, \XMM2
1204	paddd     ONE(%rip), \XMM0		# INCR CNT
1205	movdqa    \XMM0, \XMM3
1206	paddd     ONE(%rip), \XMM0		# INCR CNT
1207	movdqa    \XMM0, \XMM4
1208	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1209	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1210	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1211	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1212	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1213
	# round 0: XOR the first round key into all four counter blocks
1214	pxor	  (%arg1), \XMM1
1215	pxor	  (%arg1), \XMM2
1216	pxor	  (%arg1), \XMM3
1217	pxor	  (%arg1), \XMM4
1218	movdqu	  HashKey_4_k(%arg2), \TMP5
1219	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1220	movaps 0x10(%arg1), \TMP1
1221	aesenc	  \TMP1, \XMM1              # Round 1
1222	aesenc	  \TMP1, \XMM2
1223	aesenc	  \TMP1, \XMM3
1224	aesenc	  \TMP1, \XMM4
1225	movaps 0x20(%arg1), \TMP1
1226	aesenc	  \TMP1, \XMM1              # Round 2
1227	aesenc	  \TMP1, \XMM2
1228	aesenc	  \TMP1, \XMM3
1229	aesenc	  \TMP1, \XMM4
	# multiply XMM6 * HashKey_3 using karatsuba
1230	movdqa	  \XMM6, \TMP1
1231	pshufd	  $78, \XMM6, \TMP2
1232	pxor	  \XMM6, \TMP2
1233	movdqu	  HashKey_3(%arg2), \TMP5
1234	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1235	movaps 0x30(%arg1), \TMP3
1236	aesenc    \TMP3, \XMM1              # Round 3
1237	aesenc    \TMP3, \XMM2
1238	aesenc    \TMP3, \XMM3
1239	aesenc    \TMP3, \XMM4
1240	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1241	movaps 0x40(%arg1), \TMP3
1242	aesenc	  \TMP3, \XMM1              # Round 4
1243	aesenc	  \TMP3, \XMM2
1244	aesenc	  \TMP3, \XMM3
1245	aesenc	  \TMP3, \XMM4
1246	movdqu	  HashKey_3_k(%arg2), \TMP5
1247	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1248	movaps 0x50(%arg1), \TMP3
1249	aesenc	  \TMP3, \XMM1              # Round 5
1250	aesenc	  \TMP3, \XMM2
1251	aesenc	  \TMP3, \XMM3
1252	aesenc	  \TMP3, \XMM4
1253	pxor	  \TMP1, \TMP4
1254# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1255	pxor	  \XMM6, \XMM5
1256	pxor	  \TMP2, \TMP6
1257	movdqa	  \XMM7, \TMP1
1258	pshufd	  $78, \XMM7, \TMP2
1259	pxor	  \XMM7, \TMP2
1260	movdqu	  HashKey_2(%arg2), \TMP5
1261
1262        # Multiply XMM7 * HashKey_2 using karatsuba
1263
1264	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1265	movaps 0x60(%arg1), \TMP3
1266	aesenc	  \TMP3, \XMM1              # Round 6
1267	aesenc	  \TMP3, \XMM2
1268	aesenc	  \TMP3, \XMM3
1269	aesenc	  \TMP3, \XMM4
1270	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1271	movaps 0x70(%arg1), \TMP3
1272	aesenc	  \TMP3, \XMM1              # Round 7
1273	aesenc	  \TMP3, \XMM2
1274	aesenc	  \TMP3, \XMM3
1275	aesenc	  \TMP3, \XMM4
1276	movdqu	  HashKey_2_k(%arg2), \TMP5
1277	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1278	movaps 0x80(%arg1), \TMP3
1279	aesenc	  \TMP3, \XMM1              # Round 8
1280	aesenc	  \TMP3, \XMM2
1281	aesenc	  \TMP3, \XMM3
1282	aesenc	  \TMP3, \XMM4
1283	pxor	  \TMP1, \TMP4
1284# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1285	pxor	  \XMM7, \XMM5
1286	pxor	  \TMP2, \TMP6
1287
1288        # Multiply XMM8 * HashKey
1289        # XMM8 and TMP5 hold the values for the two operands
1290
1291	movdqa	  \XMM8, \TMP1
1292	pshufd	  $78, \XMM8, \TMP2
1293	pxor	  \XMM8, \TMP2
1294	movdqu	  HashKey(%arg2), \TMP5
1295	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1296	movaps 0x90(%arg1), \TMP3
1297	aesenc	  \TMP3, \XMM1             # Round 9
1298	aesenc	  \TMP3, \XMM2
1299	aesenc	  \TMP3, \XMM3
1300	aesenc	  \TMP3, \XMM4
1301	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
	# %r10 walks the remaining round keys; %eax counts the extra rounds
	# needed beyond the nine done above (none for the smallest key size)
1302	lea	  0xa0(%arg1),%r10
1303	mov	  keysize,%eax
1304	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1305	sub	  $4,%eax			# 128->0, 192->2, 256->4
1306	jz	  aes_loop_par_dec_done\@
1307
1308aes_loop_par_dec\@:
1309	MOVADQ	  (%r10),\TMP3
	# one more round on %xmm1..%xmm4 (register names are hard-coded here)
1310.irpc	index, 1234
1311	aesenc	  \TMP3, %xmm\index
1312.endr
1313	add	  $16,%r10
1314	sub	  $1,%eax
1315	jnz	  aes_loop_par_dec\@
1316
1317aes_loop_par_dec_done\@:
1318	MOVADQ	  (%r10), \TMP3
1319	aesenclast \TMP3, \XMM1           # last round
1320	aesenclast \TMP3, \XMM2
1321	aesenclast \TMP3, \XMM3
1322	aesenclast \TMP3, \XMM4
1323	movdqu    HashKey_k(%arg2), \TMP5
1324	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	# For each block: XOR the input with the encrypted counter, store the
	# result, then keep the unmodified input block in XMMn so that it is
	# the value hashed later (decrypt hashes the input, not the output).
1325	movdqu	  (%arg4,%r11,1), \TMP3
1326	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1327	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1328	movdqa    \TMP3, \XMM1
1329	movdqu	  16(%arg4,%r11,1), \TMP3
1330	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1331	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1332	movdqa    \TMP3, \XMM2
1333	movdqu	  32(%arg4,%r11,1), \TMP3
1334	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1335	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1336	movdqa    \TMP3, \XMM3
1337	movdqu	  48(%arg4,%r11,1), \TMP3
1338	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1339	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1340	movdqa    \TMP3, \XMM4
1341	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1342	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1343	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1344	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1345
	# fold in the fourth product and finish karatsuba:
	# TMP1 = sum of high parts, XMM5 = sum of low parts, TMP2 = middle term
1346	pxor	  \TMP4, \TMP1
1347	pxor	  \XMM8, \XMM5
1348	pxor	  \TMP6, \TMP2
1349	pxor	  \TMP1, \TMP2
1350	pxor	  \XMM5, \TMP2
1351	movdqa	  \TMP2, \TMP3
1352	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1353	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1354	pxor	  \TMP3, \XMM5
1355	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1356
1357        # first phase of reduction
1358
1359	movdqa    \XMM5, \TMP2
1360	movdqa    \XMM5, \TMP3
1361	movdqa    \XMM5, \TMP4
1362# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1363	pslld     $31, \TMP2                   # packed left shift << 31
1364	pslld     $30, \TMP3                   # packed left shift << 30
1365	pslld     $25, \TMP4                   # packed left shift << 25
1366	pxor      \TMP3, \TMP2	               # xor the shifted versions
1367	pxor      \TMP4, \TMP2
1368	movdqa    \TMP2, \TMP5
1369	psrldq    $4, \TMP5                    # right shift T5 1 DW
1370	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1371	pxor      \TMP2, \XMM5
1372
1373        # second phase of reduction
1374
1375	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1376	movdqa    \XMM5,\TMP3
1377	movdqa    \XMM5,\TMP4
1378	psrld     $1, \TMP2                    # packed right shift >> 1
1379	psrld     $2, \TMP3                    # packed right shift >> 2
1380	psrld     $7, \TMP4                    # packed right shift >> 7
1381	pxor      \TMP3,\TMP2		       # xor the shifted versions
1382	pxor      \TMP4,\TMP2
1383	pxor      \TMP5, \TMP2
1384	pxor      \TMP2, \XMM5
1385	pxor      \TMP1, \XMM5                 # reduced result is in XMM5
1386
1387	pxor	  \XMM5, \XMM1		# fold the hash into the first saved block
1388.endm
1389
1390/* GHASH the last 4 ciphertext blocks. */
1391.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1392TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
/*
 * Multiply XMM1..XMM4 by HashKey_4, HashKey_3, HashKey_2 and HashKey
 * respectively (all read from %arg2), XOR the four products together and
 * reduce the sum to 128 bits.
 *
 *   XMM1..XMM4  in: the four blocks to hash; overwritten with partial products
 *   XMMDst      out: the reduced result
 *   TMP1..TMP7  scratch
 *
 * While accumulating: TMP6 = high parts (a1*b1), XMMDst = low parts (a0*b0),
 * XMM1 = karatsuba middle parts.  No general purpose register is written.
 */
1393
1394        # Multiply XMM1 * HashKey_4 (using Karatsuba)
1395
1396	movdqa	  \XMM1, \TMP6
1397	pshufd	  $78, \XMM1, \TMP2         # swap the two 64-bit halves
1398	pxor	  \XMM1, \TMP2              # both halves of TMP2 = a1+a0
1399	movdqu	  HashKey_4(%arg2), \TMP5
1400	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1401	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1402	movdqu	  HashKey_4_k(%arg2), \TMP4
1403	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1404	movdqa	  \XMM1, \XMMDst
1405	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1406
1407        # Multiply XMM2 * HashKey_3 (using Karatsuba)
1408
1409	movdqa	  \XMM2, \TMP1
1410	pshufd	  $78, \XMM2, \TMP2
1411	pxor	  \XMM2, \TMP2
1412	movdqu	  HashKey_3(%arg2), \TMP5
1413	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1414	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1415	movdqu	  HashKey_3_k(%arg2), \TMP4
1416	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1417	pxor	  \TMP1, \TMP6
1418	pxor	  \XMM2, \XMMDst
1419	pxor	  \TMP2, \XMM1
1420# results accumulated in TMP6, XMMDst, XMM1
1421
1422        # Multiply XMM3 * HashKey_2 (using Karatsuba)
1423
1424	movdqa	  \XMM3, \TMP1
1425	pshufd	  $78, \XMM3, \TMP2
1426	pxor	  \XMM3, \TMP2
1427	movdqu	  HashKey_2(%arg2), \TMP5
1428	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1429	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1430	movdqu	  HashKey_2_k(%arg2), \TMP4
1431	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1432	pxor	  \TMP1, \TMP6
1433	pxor	  \XMM3, \XMMDst
1434	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1435
1436        # Multiply XMM4 * HashKey (using Karatsuba)
1437	movdqa	  \XMM4, \TMP1
1438	pshufd	  $78, \XMM4, \TMP2
1439	pxor	  \XMM4, \TMP2
1440	movdqu	  HashKey(%arg2), \TMP5
1441	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1442	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1443	movdqu	  HashKey_k(%arg2), \TMP4
1444	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1445	pxor	  \TMP1, \TMP6
1446	pxor	  \XMM4, \XMMDst
1447	pxor	  \XMM1, \TMP2              # TMP2 = sum of all four middle parts
1448	pxor	  \TMP6, \TMP2
1449	pxor	  \XMMDst, \TMP2
1450	# middle section of the temp results combined as in karatsuba algorithm
1451	movdqa	  \TMP2, \TMP4
1452	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1453	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1454	pxor	  \TMP4, \XMMDst
1455	pxor	  \TMP2, \TMP6
1456# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1457	# first phase of the reduction
1458	movdqa    \XMMDst, \TMP2
1459	movdqa    \XMMDst, \TMP3
1460	movdqa    \XMMDst, \TMP4
1461# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1462	pslld     $31, \TMP2                # packed left shift << 31
1463	pslld     $30, \TMP3                # packed left shift << 30
1464	pslld     $25, \TMP4                # packed left shift << 25
1465	pxor      \TMP3, \TMP2              # xor the shifted versions
1466	pxor      \TMP4, \TMP2
1467	movdqa    \TMP2, \TMP7
1468	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1469	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1470	pxor      \TMP2, \XMMDst
1471
1472        # second phase of the reduction
1473	movdqa    \XMMDst, \TMP2
1474	# make 3 copies of XMMDst for doing 3 shift operations
1475	movdqa    \XMMDst, \TMP3
1476	movdqa    \XMMDst, \TMP4
1477	psrld     $1, \TMP2                 # packed right shift >> 1
1478	psrld     $2, \TMP3                 # packed right shift >> 2
1479	psrld     $7, \TMP4                 # packed right shift >> 7
1480	pxor      \TMP3, \TMP2              # xor the shifted versions
1481	pxor      \TMP4, \TMP2
1482	pxor      \TMP7, \TMP2
1483	pxor      \TMP2, \XMMDst
1484	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1485.endm
1486
1487
1488/* Encryption of a single block
1489* uses eax & r10
1490*/
1491
1492.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
/*
 * AES-encrypt the single block in XMM0 in place, using the round keys that
 * start at (%arg1).
 *
 *   XMM0   in/out: block to encrypt
 *   TMP1   scratch: holds the current round key
 *
 * Clobbers %eax (round counter), %r10 (round key pointer) and the flags.
 * keysize is defined outside this macro; per the shift comment below it is
 * presumably the key length in bytes (16/24/32) - confirm at its definition.
 */
1493
1494	pxor		(%arg1), \XMM0		# round 0: XOR in the first round key
1495	mov		keysize,%eax
1496	shr		$2,%eax			# 128->4, 192->6, 256->8
1497	add		$5,%eax			# 128->9, 192->11, 256->13
1498	lea		16(%arg1), %r10	  # get first expanded key address
1499
	# %eax full rounds, one round key (16 bytes) consumed per iteration
1500_esb_loop_\@:
1501	MOVADQ		(%r10),\TMP1
1502	aesenc		\TMP1,\XMM0
1503	add		$16,%r10
1504	sub		$1,%eax
1505	jnz		_esb_loop_\@
1506
	# final round uses the key that follows the last one used in the loop
1507	MOVADQ		(%r10),\TMP1
1508	aesenclast	\TMP1,\XMM0
1509.endm
1510/*****************************************************************************
1511* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1512*                   struct gcm_context_data *data
1513*                                      // Context data
1514*                   u8 *out,           // Plaintext output. Decrypt in-place is allowed.
1515*                   const u8 *in,      // Ciphertext input
1516*                   u64 plaintext_len, // Length of data in bytes for decryption.
1517*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1518*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1519*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1520*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1521*                   const u8 *aad,     // Additional Authentication Data (AAD)
1522*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1523*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1524*                                      // given authentication tag and only return the plaintext if they match.
1525*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1526*                                      // (most likely), 12 or 8.
1527*
1528* Assumptions:
1529*
1530* keys:
1531*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1532*       set of 11 keys in the data structure void *aes_ctx
1533*
1534* iv:
1535*       0                   1                   2                   3
1536*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1537*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1538*       |                             Salt  (From the SA)               |
1539*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1540*       |                     Initialization Vector                     |
1541*       |         (This is the sequence number from IPSec header)       |
1542*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543*       |                              0x1                              |
1544*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545*
1546*
1547*
1548* AAD:
1549*       AAD padded to 128 bits with 0
1550*       for example, assume AAD is a u32 vector
1551*
1552*       if AAD is 8 bytes:
1553*       AAD[3] = {A0, A1};
1554*       padded AAD in xmm register = {A1 A0 0 0}
1555*
1556*       0                   1                   2                   3
1557*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1558*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1559*       |                               SPI (A1)                        |
1560*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1561*       |                     32-bit Sequence Number (A0)               |
1562*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563*       |                              0x0                              |
1564*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565*
1566*                                       AAD Format with 32-bit Sequence Number
1567*
1568*       if AAD is 12 bytes:
1569*       AAD[3] = {A0, A1, A2};
1570*       padded AAD in xmm register = {A2 A1 A0 0}
1571*
1572*       0                   1                   2                   3
1573*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1574*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1577*       |                               SPI (A2)                        |
1578*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579*       |                 64-bit Extended Sequence Number {A1,A0}       |
1580*       |                                                               |
1581*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582*       |                              0x0                              |
1583*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584*
1585*                        AAD Format with 64-bit Extended Sequence Number
1586*
1587* poly = x^128 + x^127 + x^126 + x^121 + 1
1588*
1589*****************************************************************************/
1590SYM_FUNC_START(aesni_gcm_dec)
	/*
	 * One-shot GCM decrypt: initialise the context, process the whole
	 * input, then produce the authentication tag.
	 * FUNC_SAVE / FUNC_RESTORE and the GCM_* macros are defined outside
	 * this part of the file.
	 */
1591	FUNC_SAVE
1592
	/* GCM_INIT operands: iv, hash_subkey, aad, aad_len */
1593	GCM_INIT %arg6, arg7, arg8, arg9
1594	GCM_ENC_DEC dec
	/* GCM_COMPLETE operands: auth_tag, auth_tag_len */
1595	GCM_COMPLETE arg10, arg11
1596	FUNC_RESTORE
1597	ret
1598SYM_FUNC_END(aesni_gcm_dec)
1599
1600
1601/*****************************************************************************
1602* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1603*                    struct gcm_context_data *data
1604*                                        // Context data
1605*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1606*                    const u8 *in,       // Plaintext input
1607*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1608*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1609*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1610*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1611*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1612*                    const u8 *aad,      // Additional Authentication Data (AAD)
1613*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1614*                    u8 *auth_tag,       // Authenticated Tag output.
1615*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1616*                                        // 12 or 8.
1617*
1618* Assumptions:
1619*
1620* keys:
1621*       keys are pre-expanded and aligned to 16 bytes. we are using the
1622*       first set of 11 keys in the data structure void *aes_ctx
1623*
1624*
1625* iv:
1626*       0                   1                   2                   3
1627*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1628*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1629*       |                             Salt  (From the SA)               |
1630*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1631*       |                     Initialization Vector                     |
1632*       |         (This is the sequence number from IPSec header)       |
1633*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634*       |                              0x1                              |
1635*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636*
1637*
1638*
1639* AAD:
1640*       AAD padded to 128 bits with 0
1641*       for example, assume AAD is a u32 vector
1642*
1643*       if AAD is 8 bytes:
1644*       AAD[3] = {A0, A1};
1645*       padded AAD in xmm register = {A1 A0 0 0}
1646*
1647*       0                   1                   2                   3
1648*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1649*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1650*       |                               SPI (A1)                        |
1651*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1652*       |                     32-bit Sequence Number (A0)               |
1653*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654*       |                              0x0                              |
1655*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656*
1657*                                 AAD Format with 32-bit Sequence Number
1658*
1659*       if AAD is 12 bytes:
1660*       AAD[3] = {A0, A1, A2};
1661*       padded AAD in xmm register = {A2 A1 A0 0}
1662*
1663*       0                   1                   2                   3
1664*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1665*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1666*       |                               SPI (A2)                        |
1667*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1668*       |                 64-bit Extended Sequence Number {A1,A0}       |
1669*       |                                                               |
1670*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671*       |                              0x0                              |
1672*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673*
1674*                         AAD Format with 64-bit Extended Sequence Number
1675*
1676* poly = x^128 + x^127 + x^126 + x^121 + 1
1677***************************************************************************/
1678SYM_FUNC_START(aesni_gcm_enc)
	/*
	 * One-shot GCM encrypt: initialise the context, process the whole
	 * input, then produce the authentication tag.
	 * FUNC_SAVE / FUNC_RESTORE and the GCM_* macros are defined outside
	 * this part of the file.
	 */
1679	FUNC_SAVE
1680
	/* GCM_INIT operands: iv, hash_subkey, aad, aad_len */
1681	GCM_INIT %arg6, arg7, arg8, arg9
1682	GCM_ENC_DEC enc
1683
	/* GCM_COMPLETE operands: auth_tag, auth_tag_len */
1684	GCM_COMPLETE arg10, arg11
1685	FUNC_RESTORE
1686	ret
1687SYM_FUNC_END(aesni_gcm_enc)
1688
1689/*****************************************************************************
1690* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1691*                     struct gcm_context_data *data,
1692*                                         // context data
1693*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1694*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1695*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1696*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1697*                     const u8 *aad,      // Additional Authentication Data (AAD)
1698*                     u64 aad_len)        // Length of AAD in bytes.
1699*/
1700SYM_FUNC_START(aesni_gcm_init)
	/*
	 * Initialisation step of the init/update/finalize interface; runs
	 * GCM_INIT only.  No data is processed and no tag is produced here.
	 */
1701	FUNC_SAVE
	/* GCM_INIT operands: iv, hash_subkey, aad, aad_len */
1702	GCM_INIT %arg3, %arg4,%arg5, %arg6
1703	FUNC_RESTORE
1704	ret
1705SYM_FUNC_END(aesni_gcm_init)
1706
1707/*****************************************************************************
1708* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1709*                    struct gcm_context_data *data,
1710*                                        // context data
1711*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1712*                    const u8 *in,       // Plaintext input
1713*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1714*/
1715SYM_FUNC_START(aesni_gcm_enc_update)
	/*
	 * Encrypt step of the init/update/finalize interface: runs only
	 * GCM_ENC_DEC in its enc variant, without GCM_INIT or GCM_COMPLETE.
	 */
1716	FUNC_SAVE
1717	GCM_ENC_DEC enc
1718	FUNC_RESTORE
1719	ret
1720SYM_FUNC_END(aesni_gcm_enc_update)
1721
1722/*****************************************************************************
1723* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1724*                    struct gcm_context_data *data,
1725*                                        // context data
1726*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1727*                    const u8 *in,       // Ciphertext input
1728*                    u64 plaintext_len,  // Length of data in bytes for decryption.
1729*/
1730SYM_FUNC_START(aesni_gcm_dec_update)
	/*
	 * Decrypt step of the init/update/finalize interface: runs only
	 * GCM_ENC_DEC in its dec variant, without GCM_INIT or GCM_COMPLETE.
	 */
1731	FUNC_SAVE
1732	GCM_ENC_DEC dec
1733	FUNC_RESTORE
1734	ret
1735SYM_FUNC_END(aesni_gcm_dec_update)
1736
1737/*****************************************************************************
1738* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1739*                    struct gcm_context_data *data,
1740*                                        // context data
1741*                    u8 *auth_tag,       // Authenticated Tag output.
1742*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1743*                                        // 12 or 8.
1744*/
1745SYM_FUNC_START(aesni_gcm_finalize)
	/*
	 * Final step of the init/update/finalize interface; runs
	 * GCM_COMPLETE only.
	 */
1746	FUNC_SAVE
	/* GCM_COMPLETE operands: auth_tag, auth_tag_len */
1747	GCM_COMPLETE %arg3 %arg4
1748	FUNC_RESTORE
1749	ret
1750SYM_FUNC_END(aesni_gcm_finalize)
1751
1752#endif
1753
1754
1755SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
1756SYM_FUNC_START_LOCAL(_key_expansion_256a)
	/*
	 * Derive one 128-bit round key and append it to the key schedule.
	 * (_key_expansion_128 is an alias for this same code.)
	 *
	 * In:   %xmm0 = previous round key (dwords x0..x3)
	 *       %xmm1 = aeskeygenassist result prepared by the caller
	 *       %xmm4 = low dword must be zero (callers clear %xmm4 once)
	 *       TKEYP = where to store the new key
	 * Out:  %xmm0 = new round key, also stored at (TKEYP)
	 *       TKEYP advanced by 16 bytes
	 * Clobbers %xmm1 and %xmm4; the low dword of %xmm4 is still zero on
	 * return, so the routine can be called again without re-clearing it.
	 */
1757	pshufd $0b11111111, %xmm1, %xmm1	# broadcast dword 3 of xmm1
	# the two shufps/pxor pairs turn xmm0 into its running XOR:
	# {x0, x0^x1, x0^x1^x2, x0^x1^x2^x3}
1758	shufps $0b00010000, %xmm0, %xmm4
1759	pxor %xmm4, %xmm0
1760	shufps $0b10001100, %xmm0, %xmm4
1761	pxor %xmm4, %xmm0
1762	pxor %xmm1, %xmm0			# mix in the broadcast assist word
1763	movaps %xmm0, (TKEYP)
1764	add $0x10, TKEYP
1765	ret
1766SYM_FUNC_END(_key_expansion_256a)
1767SYM_FUNC_END_ALIAS(_key_expansion_128)
1768
1769SYM_FUNC_START_LOCAL(_key_expansion_192a)
	/*
	 * 192-bit key schedule step that emits 32 bytes of schedule.
	 *
	 * In:   %xmm0 = previous four key dwords
	 *       %xmm2 = previous extra key dwords (low 64 bits are used)
	 *       %xmm1 = aeskeygenassist result prepared by the caller
	 *       %xmm4 = low dword must be zero
	 *       TKEYP = where to store the output
	 * Out:  %xmm0 and %xmm2 updated to the next key dwords;
	 *       (TKEYP)     = {old xmm2 low 64 bits, new xmm0 low 64 bits}
	 *       0x10(TKEYP) = {new xmm0 high 64 bits, new xmm2 low 64 bits}
	 *       TKEYP advanced by 32 bytes
	 * Clobbers %xmm1, %xmm3, %xmm4, %xmm5, %xmm6.
	 */
1770	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1
	# running XOR of the four dwords of xmm0 (needs low dword of xmm4 == 0)
1771	shufps $0b00010000, %xmm0, %xmm4
1772	pxor %xmm4, %xmm0
1773	shufps $0b10001100, %xmm0, %xmm4
1774	pxor %xmm4, %xmm0
1775	pxor %xmm1, %xmm0
1776
	# advance xmm2: xmm2 ^= (xmm2 << 32) ^ broadcast(dword 3 of new xmm0);
	# xmm6 keeps the old xmm2 for the first store below
1777	movaps %xmm2, %xmm5
1778	movaps %xmm2, %xmm6
1779	pslldq $4, %xmm5
1780	pshufd $0b11111111, %xmm0, %xmm3
1781	pxor %xmm3, %xmm2
1782	pxor %xmm5, %xmm2
1783
1784	movaps %xmm0, %xmm1
1785	shufps $0b01000100, %xmm0, %xmm6	# {old xmm2 low qword, xmm0 low qword}
1786	movaps %xmm6, (TKEYP)
1787	shufps $0b01001110, %xmm2, %xmm1	# {xmm0 high qword, xmm2 low qword}
1788	movaps %xmm1, 0x10(TKEYP)
1789	add $0x20, TKEYP
1790	ret
1791SYM_FUNC_END(_key_expansion_192a)
1792
1793SYM_FUNC_START_LOCAL(_key_expansion_192b)
	/*
	 * 192-bit key schedule step that emits 16 bytes of schedule.
	 *
	 * In:   %xmm0 = previous four key dwords
	 *       %xmm2 = previous extra key dwords
	 *       %xmm1 = aeskeygenassist result prepared by the caller
	 *       %xmm4 = low dword must be zero
	 *       TKEYP = where to store the output
	 * Out:  %xmm0 and %xmm2 updated to the next key dwords;
	 *       new %xmm0 stored at (TKEYP); TKEYP advanced by 16 bytes
	 * Clobbers %xmm1, %xmm3, %xmm4, %xmm5.
	 */
1794	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1
	# running XOR of the four dwords of xmm0 (needs low dword of xmm4 == 0)
1795	shufps $0b00010000, %xmm0, %xmm4
1796	pxor %xmm4, %xmm0
1797	shufps $0b10001100, %xmm0, %xmm4
1798	pxor %xmm4, %xmm0
1799	pxor %xmm1, %xmm0
1800
	# advance xmm2: xmm2 ^= (xmm2 << 32) ^ broadcast(dword 3 of new xmm0)
1801	movaps %xmm2, %xmm5
1802	pslldq $4, %xmm5
1803	pshufd $0b11111111, %xmm0, %xmm3
1804	pxor %xmm3, %xmm2
1805	pxor %xmm5, %xmm2
1806
1807	movaps %xmm0, (TKEYP)
1808	add $0x10, TKEYP
1809	ret
1810SYM_FUNC_END(_key_expansion_192b)
1811
/*
 * _key_expansion_256b:	internal ABI
 *	Second half of an AES-256 expansion round; same shape as
 *	_key_expansion_256a but it updates %xmm2 instead of %xmm0.
 * input:
 *	%xmm2:	round key produced two steps earlier
 *	%xmm1:	aeskeygenassist result computed from %xmm0
 *	%xmm4:	scratch; aesni_set_key zeroes it before the first call
 *	TKEYP:	slot that receives the new round key
 * output:
 *	%xmm2:	new round key (also written to the slot)
 *	TKEYP:	advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
SYM_FUNC_START_LOCAL(_key_expansion_256b)
	shufps $0x10, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0x8c, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pshufd $0xaa, %xmm1, %xmm1	# broadcast dword 2 of the assist result
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $16, TKEYP
	ret
SYM_FUNC_END(_key_expansion_256b)
1823
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Fill in the context:
 *	ctx + 0:	encryption key schedule
 *	ctx + 240:	decryption key schedule (derived from the above)
 *	ctx + 480:	key_len
 * The low byte of key_len picks the expansion: below 24 takes the
 * AES-128 path, exactly 24 the AES-192 path, anything else the AES-256
 * path.  Always returns 0.
 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# first 16 bytes of the user key ...
	movaps %xmm0, (KEYP)		# ... are stored as round key 0
	pxor %xmm4, %xmm4		# _key_expansion_* assume %xmm4 starts at 0
	lea 0x10(KEYP), TKEYP		# next free schedule slot
	movl %edx, 480(KEYP)		# remember key_len in the context
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	/* AES-256: the second 16 key bytes are stored as round key 1 */
	movups 0x10(UKEYP), %xmm2
	movaps %xmm2, (TKEYP)
	add $16, TKEYP
	aeskeygenassist $0x01, %xmm2, %xmm1	# round 1
	call _key_expansion_256a
	aeskeygenassist $0x01, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x02, %xmm2, %xmm1	# round 2
	call _key_expansion_256a
	aeskeygenassist $0x02, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x04, %xmm2, %xmm1	# round 3
	call _key_expansion_256a
	aeskeygenassist $0x04, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x08, %xmm2, %xmm1	# round 4
	call _key_expansion_256a
	aeskeygenassist $0x08, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_256a
	aeskeygenassist $0x20, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# remaining 8 bytes of the user key
	aeskeygenassist $0x01, %xmm2, %xmm1	# round 1
	call _key_expansion_192a
	aeskeygenassist $0x02, %xmm2, %xmm1	# round 2
	call _key_expansion_192b
	aeskeygenassist $0x04, %xmm2, %xmm1	# round 3
	call _key_expansion_192a
	aeskeygenassist $0x08, %xmm2, %xmm1	# round 4
	call _key_expansion_192b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_192a
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_192b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_192a
	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	aeskeygenassist $0x01, %xmm0, %xmm1	# round 1
	call _key_expansion_128
	aeskeygenassist $0x02, %xmm0, %xmm1	# round 2
	call _key_expansion_128
	aeskeygenassist $0x04, %xmm0, %xmm1	# round 3
	call _key_expansion_128
	aeskeygenassist $0x08, %xmm0, %xmm1	# round 4
	call _key_expansion_128
	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
	call _key_expansion_128
	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
	call _key_expansion_128
	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
	call _key_expansion_128
	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
	call _key_expansion_128
	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
	call _key_expansion_128
	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
	call _key_expansion_128
	/*
	 * Build the decryption schedule at ctx + 240.  The first and last
	 * encryption round keys are copied over with their positions swapped;
	 * every key in between goes through aesimc and is written in reverse
	 * order.
	 */
.Ldec_key:
	sub $16, TKEYP			# TKEYP -> last encryption round key
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $16, KEYP			# read cursor, walks upwards
	lea 224(TKEYP), UKEYP		# write cursor, walks downwards
.p2align 2
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	aesimc %xmm0, %xmm1
	movaps %xmm1, (UKEYP)
	add $16, KEYP
	sub $16, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG			# return value 0
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_set_key)
1938
/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt the single 16-byte block at src into dst using the encryption
 * schedule at the start of ctx.  Neither src nor dst has to be aligned.
 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movups (INP), STATE		# plaintext block
	movl 480(KEYP), KLEN		# key length stored by aesni_set_key
	call _aesni_enc1
	movups STATE, (OUTP)		# ciphertext block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_enc)
1962
/*
 * _aesni_enc1:		internal ABI
 *	Encrypt one block in place in a register.
 * input:
 *	KEYP:		key struct pointer (start of the encryption schedule)
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased by key size (KEYP + 0x30, + 0x50 or + 0x70) so that the
 * last round key always sits at 0x70(TKEYP).  Longer keys simply enter the
 * shared tail two or four rounds earlier.
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP
	pxor KEY, STATE			# round 0
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP		# lea leaves the cmp flags intact for je
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE
.p2align 2
.Lenc192:
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE
.p2align 2
.Lenc128:
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x00(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE		# last round
	ret
SYM_FUNC_END(_aesni_enc1)
2019
/*
 * _aesni_enc4:	internal ABI
 *	Encrypt four independent blocks, interleaving the rounds.
 * input:
 *	KEYP:		key struct pointer (start of the encryption schedule)
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased by key size exactly as in _aesni_enc1, so the last
 * round key is always at 0x70(TKEYP).
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP		# lea leaves the cmp flags intact for je
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x00(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE1		# last round
	aesenclast KEY, STATE2
	aesenclast KEY, STATE3
	aesenclast KEY, STATE4
	ret
SYM_FUNC_END(_aesni_enc4)
2127
/*
 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt the single 16-byte block at src into dst using the decryption
 * schedule stored 240 bytes into ctx.  Neither src nor dst has to be
 * aligned.
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movups (INP), STATE		# ciphertext block
	mov 480(KEYP), KLEN		# key length stored by aesni_set_key
	add $0xf0, KEYP			# switch to the decryption schedule
	call _aesni_dec1
	movups STATE, (OUTP)		# plaintext block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_dec)
2152
/*
 * _aesni_dec1:		internal ABI
 *	Decrypt one block in place in a register.
 * input:
 *	KEYP:		pointer to the decryption key schedule
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased by key size (KEYP + 0x30, + 0x50 or + 0x70) so that the
 * last round key always sits at 0x70(TKEYP).
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP
	pxor KEY, STATE			# round 0
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP		# lea leaves the cmp flags intact for je
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE
.p2align 2
.Ldec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE
.p2align 2
.Ldec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x00(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE		# last round
	ret
SYM_FUNC_END(_aesni_dec1)
2209
/*
 * _aesni_dec4:	internal ABI
 *	Decrypt four independent blocks, interleaving the rounds.
 * input:
 *	KEYP:		pointer to the decryption key schedule
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is biased by key size exactly as in _aesni_dec1, so the last
 * round key is always at 0x70(TKEYP).
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		# lea leaves the cmp flags intact for je
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.p2align 2
.L4dec192:
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.p2align 2
.L4dec128:
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x00(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	ret
SYM_FUNC_END(_aesni_dec4)
2317
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt every complete 16-byte block of src into dst: four blocks
 * per iteration while at least 64 bytes remain, then one block at a time.
 * A trailing partial block (len % 16 bytes) is left untouched; nothing is
 * done when len < 16.
 *
 * len is a size_t, so every length comparison is unsigned (jb / jae).
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN	# key length stored by aesni_set_key
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4	# unsigned: at least 64 bytes left
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1	# _aesni_enc1 works on STATE; this relies on
	call _aesni_enc1	# STATE1 naming the same register
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1	# unsigned: at least 16 bytes left
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_ecb_enc)
2377
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt every complete 16-byte block of src into dst with the
 * decryption schedule at ctx + 240: four blocks per iteration while at
 * least 64 bytes remain, then one block at a time.  A trailing partial
 * block is left untouched; nothing is done when len < 16.
 *
 * len is a size_t, so every length comparison is unsigned (jb / jae).
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN	# key length stored by aesni_set_key
	add $240, KEYP		# switch to the decryption schedule
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4	# unsigned: at least 64 bytes left
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1	# _aesni_dec1 works on STATE; this relies on
	call _aesni_dec1	# STATE1 naming the same register
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1	# unsigned: at least 16 bytes left
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_ecb_dec)
2438
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt every complete 16-byte block of src into dst.  The chaining
 * value starts as *iv; after the last block the final ciphertext block is
 * written back to *iv.  When len < 16 nothing is processed and *iv is not
 * touched.
 *
 * len is a size_t, so every length comparison is unsigned (jb / jae).
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN	# key length stored by aesni_set_key
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		# chain: plaintext ^ previous ciphertext (or iv)
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop	# unsigned: at least 16 bytes left
	movups STATE, (IVP)	# hand the last ciphertext block back as iv
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cbc_enc)
2482
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt every complete 16-byte block of src into dst with the
 * decryption schedule at ctx + 240: four blocks per iteration while at
 * least 64 bytes remain, then one block at a time.  The last ciphertext
 * block consumed is written back to *iv.  When len < 16 nothing is
 * processed and *iv is not touched.
 *
 * The 32-bit build only uses IN1/IN2, so it reloads the first two
 * ciphertext blocks from src after the 4-way call instead of keeping
 * all four in registers.
 *
 * len is a size_t, so every length comparison is unsigned (jb / jae).
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN	# key length stored by aesni_set_key
	add $240, KEYP		# switch to the decryption schedule
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1	# IN1/IN2 are reused for blocks 3 and 4
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1		# block 1 chains on the incoming iv
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV		# last ciphertext block is the next iv
#else
	pxor IN1, STATE4	# IN1 still holds ciphertext block 3
	movaps IN2, IV		# IN2 still holds ciphertext block 4
	movups (INP), IN1	# reload ciphertext blocks 1 and 2
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4	# unsigned: at least 64 bytes left
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV		# this ciphertext block chains into the next
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1	# unsigned: at least 16 bytes left
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cbc_dec)
2575
/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Encrypt the final two blocks of a CBC message with ciphertext stealing.
 * With n = len - 16:
 *	- src[0..15] is CBC-encrypted against *iv;
 *	- the first n bytes of that result are stored at dst[16..16+n-1];
 *	- the last n input bytes are combined with that result, encrypted,
 *	  and stored as the full block dst[0..15].
 * Two 16-byte pshufb masks are read from .Lcts_permute_table at offsets
 * n and 32 - n.  *iv is read but never written back.
 * NOTE(review): there is no length check here; this presumably relies on
 * the caller passing 16 < len <= 32 - confirm in the glue code.
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN		# key length stored by aesni_set_key
	movups (IVP), STATE		# iv; IVP is recycled as a table pointer below
	sub $0x10, LEN			# LEN = n
	lea 0x20(T1), IVP
	add LEN, T1			# T1  = table + n
	sub LEN, IVP			# IVP = table + 32 - n
	movups (T1), %xmm4
	movups (IVP), %xmm5

	movups (INP), IN1		# first block
	add LEN, INP
	movups (INP), IN2		# last 16 input bytes (overlaps the first block)

	pxor IN1, STATE
	call _aesni_enc1

	pshufb %xmm5, IN2
	pxor STATE, IN2
	pshufb %xmm4, STATE
	add OUTP, LEN			# LEN = dst + n
	movups STATE, (LEN)		# dst[n..15] is rewritten by the store below

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cts_cbc_enc)
2632
/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Decrypt the final two blocks of a CBC message that was encrypted with
 * ciphertext stealing, using the decryption schedule at ctx + 240.
 * With n = len - 16:
 *	- src[0..15] is decrypted; the result combined with the last n
 *	  input bytes gives the n-byte tail stored at dst[16..16+n-1];
 *	- the last n input bytes and the remainder of that decrypted block
 *	  are blended into a full block, which is decrypted, XORed with
 *	  *iv and stored at dst[0..15].
 * Two 16-byte pshufb masks are read from .Lcts_permute_table at offsets
 * n and 32 - n.  *iv is read but never written back.
 * NOTE(review): there is no length check here; this presumably relies on
 * the caller passing 16 < len <= 32 - confirm in the glue code.
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	mov 480(KEYP), KLEN		# key length stored by aesni_set_key
	add $0xf0, KEYP			# switch to the decryption schedule
	movups (IVP), IV		# IVP is recycled as a table pointer below
	sub $0x10, LEN			# LEN = n
	lea 0x20(T1), IVP
	add LEN, T1			# T1  = table + n
	sub LEN, IVP			# IVP = table + 32 - n
	movups (T1), %xmm4

	movups (INP), STATE		# first block
	add LEN, INP
	movups (INP), IN1		# last 16 input bytes (overlaps the first block)

	call _aesni_dec1
	movaps STATE, IN2
	pshufb %xmm4, STATE
	pxor IN1, STATE

	add OUTP, LEN			# LEN = dst + n
	movups STATE, (LEN)		# dst[n..15] is rewritten by the store below

	movups (IVP), %xmm0		# %xmm0 is the implicit pblendvb selector
	pshufb %xmm0, IN1
	pblendvb IN2, IN1
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cts_cbc_dec)
2693
.pushsection .rodata
.balign 16
/*
 * 48-byte sliding window for the ciphertext-stealing code: the identity
 * permutation 0..15 with sixteen 0x80 bytes on either side.  The users
 * load a 16-byte pshufb mask at a byte offset into the table; a mask byte
 * with bit 7 set makes pshufb write a zero byte.
 */
.Lcts_permute_table:
	.fill		16, 1, 0x80
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.fill		16, 1, 0x80
#ifdef __x86_64__
/*
 * pshufb mask that reverses the 16 bytes of a register.  It directly
 * follows the 48-byte table above and so stays 16-byte aligned.
 */
.Lbswap_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
#endif
.popsection
2708
2709#ifdef __x86_64__
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 *
 * This helper is only built for x86_64, so the mask is loaded
 * RIP-relative like the other 64-bit data references in this file; an
 * absolute reference would need an absolute relocation.
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR		# byte-reversed copy of IV
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC		# INC = 1 in the low qword, 0 above
	movq CTR, TCTR_LOW		# scalar shadow of the low counter qword
	ret
SYM_FUNC_END(_aesni_inc_init)
2730
/*
 * _aesni_inc:		internal ABI
 *	Advance the big endian counter block in IV by one.
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *
 * paddq does not carry between qwords, so the carry is detected on the
 * scalar shadow TCTR_LOW and applied to the upper qword by hand.  The
 * scalar update must stay an add: the jnc below consumes its carry flag.
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	add $1, TCTR_LOW
	paddq INC, CTR			# SSE op, leaves the carry flag alone
	jnc .Linc_low
	pslldq $8, INC			# move the 1 into the upper qword
	paddq INC, CTR
	psrldq $8, INC			# and restore INC
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV
	ret
SYM_FUNC_END(_aesni_inc)
2758
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode over every complete 16-byte block of src: each block is XORed
 * with the encryption of the current counter block, which starts as *iv
 * and is advanced by _aesni_inc after every use.  Four blocks are handled
 * per iteration while at least 64 bytes remain, then one at a time.  The
 * next unused counter block is written back to *iv.  When len < 16
 * nothing is processed and *iv is not touched.
 *
 * len is a size_t, so every length comparison is unsigned (jb / jae).
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN	# key length stored by aesni_set_key
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4	# unsigned: at least 64 bytes left
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1	# unsigned: at least 16 bytes left
.Lctr_enc_ret:
	movups IV, (IVP)	# next unused counter block
.Lctr_enc_just_ret:
	FRAME_END
	ret
SYM_FUNC_END(aesni_ctr_enc)
2821
2822#endif
2823
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.balign 16
/* low qword 0x87, high qword 0x01; see _aesni_gf128mul_x_ble() */
.Lgf128mul_x_ble_mask:
	.quad 0x0000000000000087, 0x0000000000000001
.previous
2829
/*
 * _aesni_gf128mul_x_ble:		internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	KEY:	== temporary value
 *
 * paddq doubles both qwords of IV separately, dropping bit 63 and bit 127.
 * pshufd/psrad turn those two bits into dword-wide masks beforehand: bit
 * 127 selects the 0x87 in the low qword of GF128MUL_MASK, bit 63 selects
 * the 0x01 in the high qword.  The selected bits are XORed back in.
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0b00010011, IV, KEY; \
	psrad $31, KEY; \
	paddq IV, IV; \
	pand GF128MUL_MASK, KEY; \
	pxor KEY, IV;
2847
/*
 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			  const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS-encrypt len bytes from src to dst.  *iv holds the current tweak;
 * each block is XORed with the tweak before and after the block cipher
 * and the tweak is advanced with _aesni_gf128mul_x_ble().
 *
 *	- 64 or more bytes left: four blocks per iteration.  The four
 *	  tweaks are parked in the dst buffer during the 4-way call and
 *	  read back from there afterwards.
 *	- fewer than 64 bytes left: one block at a time.
 *	- a tail shorter than 16 bytes: ciphertext stealing with the
 *	  preceding block, using masks from .Lcts_permute_table.
 *
 * When len is a multiple of 16 the next tweak is written back to *iv;
 * the ciphertext-stealing path returns without updating *iv.
 * NOTE(review): with len < 16 the .Lxts_enc_cts4 path would use a stale
 * STATE4 and step dst back by 16; presumably callers guarantee
 * len >= 16 - confirm in the glue code.
 */
SYM_FUNC_START(aesni_xts_encrypt)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	mov 480(KEYP), KLEN		# key length stored by aesni_set_key
	movups (IVP), IV		# current tweak

.Lxts_enc_loop4:
	sub $0x40, LEN
	jl .Lxts_enc_1x			# fewer than 64 bytes left

	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)		# park tweak 1 in the output buffer

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)		# park tweak 2

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)		# park tweak 3

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)		# park tweak 4

	call _aesni_enc4

	movdqu 0x00(OUTP), IN		# fetch each parked tweak and finish the block
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()		# tweak for whatever comes next

	add $0x40, INP
	add $0x40, OUTP
	test LEN, LEN
	jnz .Lxts_enc_loop4

.Lxts_enc_ret_iv:
	movups IV, (IVP)

.Lxts_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret

.Lxts_enc_1x:
	add $0x40, LEN			# undo the speculative subtraction
	jz .Lxts_enc_ret_iv
	sub $0x10, LEN
	jl .Lxts_enc_cts4		# only a partial block is left

.Lxts_enc_loop1:
	movdqu (INP), STATE
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
	_aesni_gf128mul_x_ble()

	test LEN, LEN
	jz .Lxts_enc_out		# that was the last block

	add $0x10, INP
	sub $0x10, LEN
	jl .Lxts_enc_cts1		# a partial block follows; STATE not yet stored

	movdqu STATE, (OUTP)
	add $0x10, OUTP
	jmp .Lxts_enc_loop1

.Lxts_enc_out:
	movdqu STATE, (OUTP)
	jmp .Lxts_enc_ret_iv

.Lxts_enc_cts4:
	movdqa STATE4, STATE		# steal from the last block of the 4-way pass
	sub $0x10, OUTP			# ... which was already written to dst

.Lxts_enc_cts1:
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $0x10, LEN		/* # bytes in final block */
	movups (INP), IN1

	lea 0x20(T1), IVP	/* iv pointer is no longer needed on this path */
	add LEN, T1
	sub LEN, IVP
	add OUTP, LEN		/* LEN = address of the partial output block */

	movups (T1), %xmm4
	movaps STATE, IN2
	pshufb %xmm4, STATE
	movups STATE, (LEN)

	movups (IVP), %xmm0	/* %xmm0 is the implicit pblendvb selector */
	pshufb %xmm0, IN1
	pblendvb IN2, IN1
	movaps IN1, STATE

	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE

	movups STATE, (OUTP)
	jmp .Lxts_enc_ret
SYM_FUNC_END(aesni_xts_encrypt)
3002
/*
 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			  const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS-decrypt len bytes from src to dst with the decryption schedule at
 * ctx + 240.  *iv holds the current tweak; each block is XORed with the
 * tweak before and after the block cipher and the tweak is advanced with
 * _aesni_gf128mul_x_ble().
 *
 * If len is not a multiple of 16, one full block is held back from the
 * bulk loops so it can be handled together with the partial tail.  In
 * that ciphertext-stealing step the tweak order is swapped: the last
 * full block is decrypted with the following tweak, and the block
 * rebuilt from the tail with the current one (kept in STATE4).
 *
 * When len is a multiple of 16 the next tweak is written back to *iv;
 * the ciphertext-stealing path returns without updating *iv.
 */
SYM_FUNC_START(aesni_xts_decrypt)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV		# current tweak

	mov 480(KEYP), KLEN		# key length stored by aesni_set_key
	add $0xf0, KEYP			# switch to the decryption schedule

	test $0xf, LEN
	jz .Lxts_dec_loop4
	sub $0x10, LEN			# partial tail: hold one full block back

.Lxts_dec_loop4:
	sub $0x40, LEN
	jl .Lxts_dec_1x			# fewer than 64 bytes left

	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)		# park tweak 1 in the output buffer

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)		# park tweak 2

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)		# park tweak 3

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)		# park tweak 4

	call _aesni_dec4

	movdqu 0x00(OUTP), IN		# fetch each parked tweak and finish the block
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()		# tweak for whatever comes next

	add $0x40, INP
	add $0x40, OUTP
	test LEN, LEN
	jnz .Lxts_dec_loop4

.Lxts_dec_ret_iv:
	movups IV, (IVP)

.Lxts_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret

.Lxts_dec_1x:
	add $0x40, LEN			# undo the speculative subtraction
	jz .Lxts_dec_ret_iv

.Lxts_dec_loop1:
	movdqu (INP), STATE

	add $0x10, INP
	sub $0x10, LEN
	jl .Lxts_dec_cts1		# STATE is the held-back block; a tail follows

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE
	_aesni_gf128mul_x_ble()

	test LEN, LEN
	jz .Lxts_dec_out		# that was the last block

	movdqu STATE, (OUTP)
	add $0x10, OUTP
	jmp .Lxts_dec_loop1

.Lxts_dec_out:
	movdqu STATE, (OUTP)
	jmp .Lxts_dec_ret_iv

.Lxts_dec_cts1:
	movdqa IV, STATE4		# keep the current tweak for the rebuilt block
	_aesni_gf128mul_x_ble()

	pxor IV, STATE			# last full block uses the following tweak
	call _aesni_dec1
	pxor IV, STATE

#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $0x10, LEN		/* # bytes in final block */
	movups (INP), IN1

	lea 0x20(T1), IVP	/* iv pointer is no longer needed on this path */
	add LEN, T1
	sub LEN, IVP
	add OUTP, LEN		/* LEN = address of the partial output block */

	movups (T1), %xmm4
	movaps STATE, IN2
	pshufb %xmm4, STATE
	movups STATE, (LEN)

	movups (IVP), %xmm0	/* %xmm0 is the implicit pblendvb selector */
	pshufb %xmm0, IN1
	pblendvb IN2, IN1
	movaps IN1, STATE

	pxor STATE4, STATE
	call _aesni_dec1
	pxor STATE4, STATE

	movups STATE, (OUTP)
	jmp .Lxts_dec_ret
SYM_FUNC_END(aesni_xts_decrypt)
3164