xref: /openbmc/linux/arch/x86/crypto/aesni-intel_asm.S (revision f97cee494dc92395a668445bcd24d34c89f4ff8c)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Implement AES algorithm in Intel AES-NI instructions.
4 *
5 * The white paper of AES-NI instructions can be downloaded from:
6 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 *
8 * Copyright (C) 2008, Intel Corp.
9 *    Author: Huang Ying <ying.huang@intel.com>
10 *            Vinodh Gopal <vinodh.gopal@intel.com>
11 *            Kahraman Akdemir
12 *
13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14 * interface for 64-bit kernels.
15 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
17 *             Adrian Hoban <adrian.hoban@intel.com>
18 *             James Guilford (james.guilford@intel.com)
19 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
20 *             Tadeusz Struk (tadeusz.struk@intel.com)
21 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
22 *    Copyright (c) 2010, Intel Corporation.
23 *
24 * Ported x86_64 version to x86:
25 *    Author: Mathias Krause <minipli@googlemail.com>
26 */
27
28#include <linux/linkage.h>
29#include <asm/frame.h>
30#include <asm/nospec-branch.h>
31
32/*
33 * The following macros are used to move an (un)aligned 16 byte value to/from
34 * an XMM register.  This can be done for either FP or integer values, for FP use
35 * movaps (move aligned packed single) or integer use movdqa (move double quad
36 * aligned).  It doesn't make a performance difference which instruction is used
37 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
38 * shorter, so that is the one we'll use for now. (same for unaligned).
39 */
40#define MOVADQ	movaps
41#define MOVUDQ	movups
42
43#ifdef __x86_64__
44
45# constants in mergeable sections, linker can reorder and merge
46.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
47.align 16
48.Lgf128mul_x_ble_mask:
49	.octa 0x00000000000000010000000000000087
# POLY and TWOONE are read by PRECOMPUTE when it reduces HashKey<<1.
50.section	.rodata.cst16.POLY, "aM", @progbits, 16
51.align 16
52POLY:   .octa 0xC2000000000000000000000000000001
53.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
54.align 16
55TWOONE: .octa 0x00000001000000000000000000000001
56
# SHUF_MASK is a pshufb control that reverses the byte order of a 16-byte
# value (the code calls this a 16 byte swap).
57.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
58.align 16
59SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
60.section	.rodata.cst16.MASK1, "aM", @progbits, 16
61.align 16
62MASK1:      .octa 0x0000000000000000ffffffffffffffff
63.section	.rodata.cst16.MASK2, "aM", @progbits, 16
64.align 16
65MASK2:      .octa 0xffffffffffffffff0000000000000000
# ONE is added to the counter block with paddd to step the counter.
66.section	.rodata.cst16.ONE, "aM", @progbits, 16
67.align 16
68ONE:        .octa 0x00000000000000000000000000000001
# NOTE(review): the F_MIN_MASK literal has 31 hex digits, so its top nibble
# is zero; confirm that this is intended.
69.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
70.align 16
71F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
72.section	.rodata.cst16.dec, "aM", @progbits, 16
73.align 16
74dec:        .octa 0x1
75.section	.rodata.cst16.enc, "aM", @progbits, 16
76.align 16
77enc:        .octa 0x2
78
79# order of these constants should not change.
80# more specifically, ALL_F should follow SHIFT_MASK,
81# and zero should follow ALL_F
# The GCM macros index across the three values with byte offsets
# (SHIFT_MASK+16 or ALL_F+16 minus a byte count, and ALL_F-SHIFT_MASK added
# to a pointer into SHIFT_MASK), which is why they live together in plain
# .rodata instead of separate mergeable sections.
82.section	.rodata, "a", @progbits
83.align 16
84SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
85ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
86            .octa 0x00000000000000000000000000000000
87
88.text
89
90
# STACK_OFFSET is the number of bytes that FUNC_SAVE pushes (three 8-byte
# registers); the stack-passed argN slots below are addressed relative to it.
91#define	STACK_OFFSET    8*3
92
# Byte offsets of the fields kept in the context block that the GCM macros
# address through %arg2.  AadLen, InLen, PBlockLen, PBlockEncKey, OrigIV and
# CurCount are labelled ctx_data.* by the comments in GCM_INIT.
93#define AadHash 16*0
94#define AadLen 16*1
95#define InLen (16*1)+8
96#define PBlockEncKey 16*2
97#define OrigIV 16*3
98#define CurCount 16*4
99#define PBlockLen 16*5
100#define	HashKey		16*6	// store HashKey <<1 mod poly here
101#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
102#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
103#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
104#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
105				// bits of  HashKey <<1 mod poly here
106				//(for Karatsuba purposes)
107#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
108				// bits of  HashKey^2 <<1 mod poly here
109				// (for Karatsuba purposes)
110#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
111				// bits of  HashKey^3 <<1 mod poly here
112				// (for Karatsuba purposes)
113#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
114				// bits of  HashKey^4 <<1 mod poly here
115				// (for Karatsuba purposes)
116
# The first six names are bare register names (written as %argN at the use
# site) for the six register-passed arguments.  The remaining names are
# stack slots above the registers saved by FUNC_SAVE; the extra 8 bytes
# presumably skip the return address (TODO confirm against the callers).
117#define arg1 rdi
118#define arg2 rsi
119#define arg3 rdx
120#define arg4 rcx
121#define arg5 r8
122#define arg6 r9
123#define arg7 STACK_OFFSET+8(%rsp)
124#define arg8 STACK_OFFSET+16(%rsp)
125#define arg9 STACK_OFFSET+24(%rsp)
126#define arg10 STACK_OFFSET+32(%rsp)
127#define arg11 STACK_OFFSET+40(%rsp)
# keysize is read as a 32-bit value at byte offset 2*15*16 from %arg1 and is
# divided by 4 to derive the AES round count.
# NOTE(review): presumably the key length in bytes stored after two 15-entry
# round-key arrays; confirm against the C definition of the key context.
128#define keysize 2*15*16(%arg1)
129#endif
130
131
# Register aliases.  The XMM names are shared by both word sizes, with %xmm10
# aliased by both BSWAP_MASK and GF128MUL_MASK.
132#define STATE1	%xmm0
133#define STATE2	%xmm4
134#define STATE3	%xmm5
135#define STATE4	%xmm6
136#define STATE	STATE1
137#define IN1	%xmm1
138#define IN2	%xmm7
139#define IN3	%xmm8
140#define IN4	%xmm9
141#define IN	IN1
142#define KEY	%xmm2
143#define IV	%xmm3
144
145#define BSWAP_MASK %xmm10
146#define CTR	%xmm11
147#define INC	%xmm12
148
149#define GF128MUL_MASK %xmm10
150
# General-purpose aliases differ between the 64-bit and the 32-bit build.
# T2 and TCTR_LOW are only defined for x86_64, and on 32-bit OUTP (and so
# UKEYP) shares a register with AREG.
151#ifdef __x86_64__
152#define AREG	%rax
153#define KEYP	%rdi
154#define OUTP	%rsi
155#define UKEYP	OUTP
156#define INP	%rdx
157#define LEN	%rcx
158#define IVP	%r8
159#define KLEN	%r9d
160#define T1	%r10
161#define TKEYP	T1
162#define T2	%r11
163#define TCTR_LOW T2
164#else
165#define AREG	%eax
166#define KEYP	%edi
167#define OUTP	AREG
168#define UKEYP	OUTP
169#define INP	%edx
170#define LEN	%esi
171#define IVP	%ebp
172#define KLEN	%ebx
173#define T1	%ecx
174#define TKEYP	T1
175#endif
175#endif
176
# FUNC_SAVE: push %r12, %r13 and %r14 (24 bytes in total, which is the value
# of STACK_OFFSET) so that the code which follows may overwrite them.
# FUNC_RESTORE pops them again in the opposite order.
177.macro FUNC_SAVE
178	push	%r12
179	push	%r13
180	push	%r14
181#
182# states of %xmm registers %xmm6:%xmm15 not saved
183# all %xmm registers are clobbered
184#
185.endm
186
187
# FUNC_RESTORE: pop %r14, %r13 and %r12, the reverse of the pushes done by
# FUNC_SAVE.  The stack pointer must be back at its post-FUNC_SAVE value.
188.macro FUNC_RESTORE
189	pop	%r14
190	pop	%r13
191	pop	%r12
192.endm
193
194# Precompute hashkeys.
195# Input: Hash subkey.
# SUBKEY is the address of the subkey; 16 bytes are loaded from it.
196# Output: HashKeys stored in gcm_context_data.  Only needs to be called
197# once per key.
# Stored through %arg2: HashKey, HashKey_2, HashKey_3, HashKey_4 and the
# matching HashKey_k, HashKey_2_k, HashKey_3_k, HashKey_4_k values.
198# clobbers r12, and tmp xmm registers.
# All of TMP1-TMP7 are written, either here or by GHASH_MUL.
199.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
200	mov	\SUBKEY, %r12
201	movdqu	(%r12), \TMP3
202	movdqa	SHUF_MASK(%rip), \TMP2
203	pshufb	\TMP2, \TMP3
204
205	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
206
	# 128-bit left shift by one bit: shift both qwords, then carry bit 63
	# of the low qword into the high qword.  TMP1 keeps the bit that was
	# shifted out of the top.
207	movdqa	\TMP3, \TMP2
208	psllq	$1, \TMP3
209	psrlq	$63, \TMP2
210	movdqa	\TMP2, \TMP1
211	pslldq	$8, \TMP2
212	psrldq	$8, \TMP1
213	por	\TMP2, \TMP3
214
215	# reduce HashKey<<1
216
	# Build an all-ones mask when the shifted-out bit was set (comparison
	# against TWOONE) and XOR in POLY under that mask, without a branch.
217	pshufd	$0x24, \TMP1, \TMP2
218	pcmpeqd TWOONE(%rip), \TMP2
219	pand	POLY(%rip), \TMP2
220	pxor	\TMP2, \TMP3
221	movdqu	\TMP3, HashKey(%arg2)
222
	# pshufd $78 swaps the two 64-bit halves, so the XOR below leaves
	# high^low in both halves (the Karatsuba helper value).
223	movdqa	   \TMP3, \TMP5
224	pshufd	   $78, \TMP3, \TMP1
225	pxor	   \TMP3, \TMP1
226	movdqu	   \TMP1, HashKey_k(%arg2)
227
228	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
229# TMP5 = HashKey^2<<1 (mod poly)
230	movdqu	   \TMP5, HashKey_2(%arg2)
231# HashKey_2 = HashKey^2<<1 (mod poly)
232	pshufd	   $78, \TMP5, \TMP1
233	pxor	   \TMP5, \TMP1
234	movdqu	   \TMP1, HashKey_2_k(%arg2)
235
236	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
237# TMP5 = HashKey^3<<1 (mod poly)
238	movdqu	   \TMP5, HashKey_3(%arg2)
239	pshufd	   $78, \TMP5, \TMP1
240	pxor	   \TMP5, \TMP1
241	movdqu	   \TMP1, HashKey_3_k(%arg2)
242
243	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
244# TMP5 = HashKey^4<<1 (mod poly)
245	movdqu	   \TMP5, HashKey_4(%arg2)
246	pshufd	   $78, \TMP5, \TMP1
247	pxor	   \TMP5, \TMP1
248	movdqu	   \TMP1, HashKey_4_k(%arg2)
249.endm
250
251# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Iv is the address of the 16-byte initial counter block, SUBKEY the address
# of the hash subkey, AAD the address of the additional authenticated data
# and AADLEN its length in bytes.  The context is addressed through %arg2.
252# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
# The expansions below additionally write %xmm7 (PRECOMPUTE) and %xmm14
# (CALC_AAD_HASH).
253.macro GCM_INIT Iv SUBKEY AAD AADLEN
254	mov \AADLEN, %r11
255	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
256	xor %r11d, %r11d
257	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
258	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
	# The store below is 8 bytes wide, so only the low half of the 16-byte
	# PBlockEncKey slot is cleared.
259	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
260	mov \Iv, %rax
261	movdqu (%rax), %xmm0
262	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
263
	# The working counter is kept byte-reversed relative to OrigIV.
264	movdqa  SHUF_MASK(%rip), %xmm2
265	pshufb %xmm2, %xmm0
266	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
267
268	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
269	movdqu HashKey(%arg2), %xmm13
270
	# Hash the AAD with the freshly stored HashKey; the result is written
	# to AadHash in the context by CALC_AAD_HASH.
271	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
272	%xmm4, %xmm5, %xmm6
273.endm
274
275# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
276# struct has been initialized by GCM_INIT.
277# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
278# Clobbers rax, r10-r13, and xmm0-xmm15
# operation is the literal enc or dec and selects code through .ifc.
# Register roles visible below: %arg2 = context, %arg3 = output buffer,
# %arg4 = input buffer, %arg5 = length in bytes (reduced in place by the
# bytes that PARTIAL_BLOCK consumes), %r11 = running byte offset into both
# buffers, %xmm8 = running hash, %xmm0 = current counter block.
279.macro GCM_ENC_DEC operation
280	movdqu AadHash(%arg2), %xmm8
281	movdqu HashKey(%arg2), %xmm13
282	add %arg5, InLen(%arg2)
283
284	xor %r11d, %r11d # initialise the data pointer offset as zero
	# Complete (or extend) a partial block left over in the context.
285	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
286
287	sub %r11, %arg5		# sub partial block data used
288	mov %arg5, %r13		# save the number of bytes
289
290	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
291	mov %r13, %r12
292	# Encrypt/Decrypt first few blocks
293
	# %r12 = (number of whole blocks mod 4) * 16.  Handle that many
	# blocks first so the main loop always sees a multiple of four.
294	and	$(3<<4), %r12
295	jz	_initial_num_blocks_is_0_\@
296	cmp	$(2<<4), %r12
297	jb	_initial_num_blocks_is_1_\@
298	je	_initial_num_blocks_is_2_\@
299_initial_num_blocks_is_3_\@:
300	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
301%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
302	sub	$48, %r13
303	jmp	_initial_blocks_\@
304_initial_num_blocks_is_2_\@:
305	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
306%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
307	sub	$32, %r13
308	jmp	_initial_blocks_\@
309_initial_num_blocks_is_1_\@:
310	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
311%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
312	sub	$16, %r13
313	jmp	_initial_blocks_\@
314_initial_num_blocks_is_0_\@:
315	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
316%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
317_initial_blocks_\@:
318
319	# Main loop - Encrypt/Decrypt remaining blocks
320
	# %r13 is now a multiple of 64.  When it is non-zero the first group
	# of four blocks has already been encrypted by INITIAL_BLOCKS_ENC_DEC
	# and only its hashing is still pending.
321	cmp	$0, %r13
322	je	_zero_cipher_left_\@
323	sub	$64, %r13
324	je	_four_cipher_left_\@
325_crypt_by_4_\@:
	# NOTE(review): the trailing operation argument is always the literal
	# enc here, also when this macro is expanded for dec.
326	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
327	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
328	%xmm7, %xmm8, enc
329	add	$64, %r11
330	sub	$64, %r13
331	jne	_crypt_by_4_\@
332_four_cipher_left_\@:
333	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
334%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
335_zero_cipher_left_\@:
	# Persist the running hash and the counter for later calls.
336	movdqu %xmm8, AadHash(%arg2)
337	movdqu %xmm0, CurCount(%arg2)
338
339	mov	%arg5, %r13
340	and	$15, %r13			# %r13 = arg5 (mod 16)
341	je	_multiple_of_16_bytes_\@
342
343	mov %r13, PBlockLen(%arg2)
344
345	# Handle the last <16 Byte block separately
346	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
347	movdqu %xmm0, CurCount(%arg2)
348	movdqa SHUF_MASK(%rip), %xmm10
349	pshufb %xmm10, %xmm0
350
351	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
	# Keep the keystream block so a later call can continue this block.
352	movdqu %xmm0, PBlockEncKey(%arg2)
353
	# With fewer than 16 bytes in this update a 16-byte load could leave
	# the input buffer, so read only the %r13 bytes that exist.
354	cmp	$16, %arg5
355	jge _large_enough_update_\@
356
357	lea (%arg4,%r11,1), %r10
358	mov %r13, %r12
359	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
360	jmp _data_read_\@
361
362_large_enough_update_\@:
	# Step %r11 back so that the 16-byte load ends exactly at the last
	# input byte; %r11 is put back right after the load.
363	sub	$16, %r11
364	add	%r13, %r11
365
366	# receive the last <16 Byte block
367	movdqu	(%arg4, %r11, 1), %xmm1
368
369	sub	%r13, %r11
370	add	$16, %r11
371
372	lea	SHIFT_MASK+16(%rip), %r12
373	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
374	# (r13 is the number of bytes in plaintext mod 16)
375	sub	%r13, %r12
376	# get the appropriate shuffle mask
377	movdqu	(%r12), %xmm2
378	# shift right 16-r13 bytes
379	pshufb  %xmm2, %xmm1
380
381_data_read_\@:
	# %r12 = ALL_F+16-r13: a 16-byte load from there gives r13 bytes of
	# 0xff followed by zero bytes (from the zero block after ALL_F).
382	lea ALL_F+16(%rip), %r12
383	sub %r13, %r12
384
385.ifc \operation, dec
	# For dec the hash is taken over the input (ciphertext), so keep a
	# copy of it before it is XORed with the keystream.
386	movdqa  %xmm1, %xmm2
387.endif
388	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
389	movdqu	(%r12), %xmm1
390	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
391	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
392.ifc \operation, dec
393	pand    %xmm1, %xmm2
394	movdqa SHUF_MASK(%rip), %xmm10
395	pshufb %xmm10 ,%xmm2
396
397	pxor %xmm2, %xmm8
398.else
399	movdqa SHUF_MASK(%rip), %xmm10
400	pshufb %xmm10,%xmm0
401
402	pxor	%xmm0, %xmm8
403.endif
404
	# The partial block is only XORed into the hash here; the multiply
	# for it is done later (see the PBlockLen test in GCM_COMPLETE and the
	# block-completion path of PARTIAL_BLOCK).
405	movdqu %xmm8, AadHash(%arg2)
406.ifc \operation, enc
407	# GHASH computation for the last <16 byte block
408	movdqa SHUF_MASK(%rip), %xmm10
409	# shuffle xmm0 back to output as ciphertext
410	pshufb %xmm10, %xmm0
411.endif
412
413	# Output %r13 bytes
	# Store 8 bytes at once when more than 8 remain, then the rest one
	# byte at a time from the low end of %rax.
414	movq %xmm0, %rax
415	cmp $8, %r13
416	jle _less_than_8_bytes_left_\@
417	mov %rax, (%arg3 , %r11, 1)
418	add $8, %r11
419	psrldq $8, %xmm0
420	movq %xmm0, %rax
421	sub $8, %r13
422_less_than_8_bytes_left_\@:
423	mov %al,  (%arg3, %r11, 1)
424	add $1, %r11
425	shr $8, %rax
426	sub $1, %r13
427	jne _less_than_8_bytes_left_\@
428_multiple_of_16_bytes_\@:
429.endm
430
431# GCM_COMPLETE Finishes update of tag of last partial block
432# Output: Authorization Tag (AUTH_TAG)
# AUTHTAG is the address the tag is written to and AUTHTAGLEN the number of
# tag bytes to write.  The context is addressed through %arg2.
433# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
434.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
435	movdqu AadHash(%arg2), %xmm8
436	movdqu HashKey(%arg2), %xmm13
437
438	mov PBlockLen(%arg2), %r12
439
	# A pending partial block has been XORed into the hash but not yet
	# multiplied by the hash key; do that multiply now.
440	cmp $0, %r12
441	je _partial_done\@
442
443	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
444
445_partial_done\@:
446	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
447	shl	$3, %r12		  # convert into number of bits
	# AadLen is a 64-bit field and len(A) is a 64-bit quantity in the
	# length block, so move the whole register instead of its low 32 bits.
448	movq	%r12, %xmm15		  # len(A) in %xmm15
449	mov InLen(%arg2), %r12
450	shl     $3, %r12                  # len(C) in bits (*128)
451	movq    %r12, %xmm1
452
453	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
454	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
455	pxor	%xmm15, %xmm8
456	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
457	# final GHASH computation
458	movdqa SHUF_MASK(%rip), %xmm10
459	pshufb %xmm10, %xmm8
460
461	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
462	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
463	pxor	%xmm8, %xmm0
464_return_T_\@:
	# Write AUTHTAGLEN bytes of %xmm0: one 16-byte store for a full tag,
	# otherwise an 8-byte piece (when at least 8 are wanted), a 4-byte
	# piece and finally 1 to 3 single bytes.
	# NOTE(review): the 4-byte store at _T_4_ is unconditional, so lengths
	# of 1-3 or 9-11 are not handled by this sequence; presumably callers
	# never request them.
465	mov	\AUTHTAG, %r10                     # %r10 = authTag
466	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
467	cmp	$16, %r11
468	je	_T_16_\@
469	cmp	$8, %r11
470	jl	_T_4_\@
471_T_8_\@:
472	movq	%xmm0, %rax
473	mov	%rax, (%r10)
474	add	$8, %r10
475	sub	$8, %r11
476	psrldq	$8, %xmm0
477	cmp	$0, %r11
478	je	_return_T_done_\@
479_T_4_\@:
480	movd	%xmm0, %eax
481	mov	%eax, (%r10)
482	add	$4, %r10
483	sub	$4, %r11
484	psrldq	$4, %xmm0
485	cmp	$0, %r11
486	je	_return_T_done_\@
487_T_123_\@:
488	movd	%xmm0, %eax
489	cmp	$2, %r11
490	jl	_T_1_\@
491	mov	%ax, (%r10)
492	cmp	$2, %r11
493	je	_return_T_done_\@
494	add	$2, %r10
495	sar	$16, %eax
496_T_1_\@:
497	mov	%al, (%r10)
498	jmp	_return_T_done_\@
499_T_16_\@:
500	movdqu	%xmm0, (%r10)
501_return_T_done_\@:
502.endm
503
504#ifdef __x86_64__
505/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
506*
507*
508* Input: A and B (128-bits each, bit-reflected)
509* Output: C = A*B*x mod poly, (i.e. >>1 )
510* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
511* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
512*
* The product is left in GH.  HK is only read.  TMP1-TMP5 are overwritten.
513*/
514.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	# Karatsuba: three carry-less multiplies (high*high, low*low and
	# (a1+a0)*(b1+b0)) give the 256-bit product in TMP1:GH.
515	movdqa	  \GH, \TMP1
516	pshufd	  $78, \GH, \TMP2
517	pshufd	  $78, \HK, \TMP3
518	pxor	  \GH, \TMP2            # TMP2 = a1+a0
519	pxor	  \HK, \TMP3            # TMP3 = b1+b0
520	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
521	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
522	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
523	pxor	  \GH, \TMP2
524	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
525	movdqa	  \TMP2, \TMP3
526	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
527	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
528	pxor	  \TMP3, \GH
529	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
530
531        # first phase of the reduction
532
533	movdqa    \GH, \TMP2
534	movdqa    \GH, \TMP3
535	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
536					# in order to perform
537					# independent shifts
538	pslld     $31, \TMP2            # packed left shift <<31
539	pslld     $30, \TMP3            # packed left shift <<30
540	pslld     $25, \TMP4            # packed left shift <<25
541	pxor      \TMP3, \TMP2          # xor the shifted versions
542	pxor      \TMP4, \TMP2
543	movdqa    \TMP2, \TMP5
544	psrldq    $4, \TMP5             # right shift TMP5 1 DW
545	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
546	pxor      \TMP2, \GH
547
548        # second phase of the reduction
549
550	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
551					# in order to perform
552					# independent shifts
553	movdqa    \GH,\TMP3
554	movdqa    \GH,\TMP4
555	psrld     $1,\TMP2              # packed right shift >>1
556	psrld     $2,\TMP3              # packed right shift >>2
557	psrld     $7,\TMP4              # packed right shift >>7
558	pxor      \TMP3,\TMP2		# xor the shifted versions
559	pxor      \TMP4,\TMP2
560	pxor      \TMP5, \TMP2
561	pxor      \TMP2, \GH
562	pxor      \TMP1, \GH            # result is in GH
563.endm
564
565# Reads DLEN bytes starting at DPTR and stores in XMMDst
566# where 0 < DLEN < 16
567# Clobbers %rax, DLEN and XMM1
# Only the DLEN bytes that exist are read, so the load cannot run past the
# end of the buffer.  The bytes land in the low DLEN bytes of XMMDst in
# memory order and the remaining bytes of XMMDst are zero.  DLEN is counted
# down to zero.  DPTR and DLEN must not be %rax, which is the scratch
# register.  A DLEN of zero is not handled: the byte loops test the count
# only after the decrement.
568.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
569        cmp $8, \DLEN
570        jl _read_lt8_\@
	# At least 8 bytes: take the low 8 with one load (movq also zeroes
	# the upper half of XMMDst).
571        mov (\DPTR), %rax
572        movq %rax, \XMMDst
573        sub $8, \DLEN
574        jz _done_read_partial_block_\@
575	xor %eax, %eax
576_read_next_byte_\@:
	# Gather bytes 8..end from the highest address down, shifting earlier
	# bytes up, so the lowest address ends in the lowest byte of %rax.
577        shl $8, %rax
578        mov 7(\DPTR, \DLEN, 1), %al
579        dec \DLEN
580        jnz _read_next_byte_\@
581        movq %rax, \XMM1
582	pslldq $8, \XMM1
583        por \XMM1, \XMMDst
584	jmp _done_read_partial_block_\@
585_read_lt8_\@:
586	xor %eax, %eax
587_read_next_byte_lt8_\@:
	# Same byte-gathering loop for fewer than 8 bytes in total.
588        shl $8, %rax
589        mov -1(\DPTR, \DLEN, 1), %al
590        dec \DLEN
591        jnz _read_next_byte_lt8_\@
592        movq %rax, \XMMDst
593_done_read_partial_block_\@:
594.endm
595
596# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# HASHKEY is the register holding the hash key passed on to GHASH_MUL, AAD
# the address of the data and AADLEN its length in bytes.  The result is
# stored to AadHash in the context at %arg2 and is also left in TMP6.
597# clobbers r10-11, xmm14
# Also written: %rax (by READ_PARTIAL_BLOCK) and TMP1-TMP7.
598.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
599	TMP6 TMP7
600	MOVADQ	   SHUF_MASK(%rip), %xmm14
601	mov	   \AAD, %r10		# %r10 = AAD
602	mov	   \AADLEN, %r11		# %r11 = aadLen
603	pxor	   \TMP7, \TMP7
604	pxor	   \TMP6, \TMP6
605
	# TMP6 is the running hash (starts at zero).  Whole 16-byte blocks are
	# folded in first.
606	cmp	   $16, %r11
607	jl	   _get_AAD_rest\@
608_get_AAD_blocks\@:
609	movdqu	   (%r10), \TMP7
610	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
611	pxor	   \TMP7, \TMP6
612	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
613	add	   $16, %r10
614	sub	   $16, %r11
615	cmp	   $16, %r11
616	jge	   _get_AAD_blocks\@
617
618	movdqu	   \TMP6, \TMP7
619
620	/* read the last <16B of AAD */
621_get_AAD_rest\@:
622	cmp	   $0, %r11
623	je	   _get_AAD_done\@
624
	# The tail is read byte-exact and arrives zero padded in TMP7, then it
	# is hashed like a full block.
625	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
626	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
627	pxor	   \TMP6, \TMP7
628	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
629	movdqu \TMP7, \TMP6
630
631_get_AAD_done\@:
632	movdqu \TMP6, AadHash(%arg2)
633.endm
634
635# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
636# between update calls.
637# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
638# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
639# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
# CYPH_PLAIN_OUT and PLAIN_CYPH_IN are the output and input buffer
# registers, PLAIN_CYPH_LEN the input length in bytes, DATA_OFFSET a register
# that is advanced by the number of bytes written, AAD_HASH the register with
# the running hash, and operation the literal enc or dec.
# Nothing is done when PBlockLen in the context is zero.
640.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
641	AAD_HASH operation
642	mov 	PBlockLen(%arg2), %r13
643	cmp	$0, %r13
644	je	_partial_block_done_\@	# Leave Macro if no partial blocks
645	# Read in input data without over reading
646	cmp	$16, \PLAIN_CYPH_LEN
647	jl	_fewer_than_16_bytes_\@
648	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
649	jmp	_data_read_\@
650
651_fewer_than_16_bytes_\@:
652	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
653	mov	\PLAIN_CYPH_LEN, %r12
654	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
655
656	mov PBlockLen(%arg2), %r13
657
658_data_read_\@:				# Finished reading in data
659
	# %xmm9 = keystream block that was saved for the unfinished block.
660	movdqu	PBlockEncKey(%arg2), %xmm9
661	movdqu	HashKey(%arg2), %xmm13
662
663	lea	SHIFT_MASK(%rip), %r12
664
665	# adjust the shuffle mask pointer to be able to shift r13 bytes
666	# (r13 is the number of bytes already held in the partial block)
667	add	%r13, %r12
668	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
669	pshufb	%xmm2, %xmm9		# shift right r13 bytes
670
671.ifc \operation, dec
672	movdqa	%xmm1, %xmm3
673	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
674
675	mov	\PLAIN_CYPH_LEN, %r10
676	add	%r13, %r10
677	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
678	sub	$16, %r10
679	# Determine if partial block is not being filled and
680	# shift mask accordingly
	# r10 is negative when the block stays incomplete; subtracting it
	# moves the mask pointer forward by the number of missing bytes.
681	jge	_no_extra_mask_1_\@
682	sub	%r10, %r12
683_no_extra_mask_1_\@:
684
685	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
686	# get the appropriate mask to mask out bottom r13 bytes of xmm9
687	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
688
	# The hash for dec is taken over the input (ciphertext) copy in xmm3.
689	pand	%xmm1, %xmm3
690	movdqa	SHUF_MASK(%rip), %xmm10
691	pshufb	%xmm10, %xmm3
692	pshufb	%xmm2, %xmm3
693	pxor	%xmm3, \AAD_HASH
694
695	cmp	$0, %r10
696	jl	_partial_incomplete_1_\@
697
698	# GHASH computation for the last <16 Byte block
699	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
700	xor	%eax, %eax
701
	# The block is complete now, so no partial block remains.
702	mov	%rax, PBlockLen(%arg2)
703	jmp	_dec_done_\@
704_partial_incomplete_1_\@:
	# Still incomplete: record the additional bytes that were consumed.
705	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
706_dec_done_\@:
707	movdqu	\AAD_HASH, AadHash(%arg2)
708.else
709	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
710
711	mov	\PLAIN_CYPH_LEN, %r10
712	add	%r13, %r10
713	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
714	sub	$16, %r10
715	# Determine if partial block is not being filled and
716	# shift mask accordingly
	# r10 is negative when the block stays incomplete; subtracting it
	# moves the mask pointer forward by the number of missing bytes.
717	jge	_no_extra_mask_2_\@
718	sub	%r10, %r12
719_no_extra_mask_2_\@:
720
721	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
722	# get the appropriate mask to mask out bottom r13 bytes of xmm9
723	pand	%xmm1, %xmm9
724
	# The hash for enc is taken over the produced ciphertext in xmm9.
725	movdqa	SHUF_MASK(%rip), %xmm1
726	pshufb	%xmm1, %xmm9
727	pshufb	%xmm2, %xmm9
728	pxor	%xmm9, \AAD_HASH
729
730	cmp	$0, %r10
731	jl	_partial_incomplete_2_\@
732
733	# GHASH computation for the last <16 Byte block
734	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
735	xor	%eax, %eax
736
	# The block is complete now, so no partial block remains.
737	mov	%rax, PBlockLen(%arg2)
738	jmp	_encode_done_\@
739_partial_incomplete_2_\@:
	# Still incomplete: record the additional bytes that were consumed.
740	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
741_encode_done_\@:
742	movdqu	\AAD_HASH, AadHash(%arg2)
743
744	movdqa	SHUF_MASK(%rip), %xmm10
745	# shuffle xmm9 back to output as ciphertext
746	pshufb	%xmm10, %xmm9
747	pshufb	%xmm2, %xmm9
748.endif
749	# output encrypted Bytes
	# Bytes to write: 16 - (old partial length) when the block was
	# completed, otherwise all PLAIN_CYPH_LEN input bytes.
750	cmp	$0, %r10
751	jl	_partial_fill_\@
752	mov	%r13, %r12
753	mov	$16, %r13
754	# Set r13 to be the number of bytes to write out
755	sub	%r12, %r13
756	jmp	_count_set_\@
757_partial_fill_\@:
758	mov	\PLAIN_CYPH_LEN, %r13
759_count_set_\@:
760	movdqa	%xmm9, %xmm0
761	movq	%xmm0, %rax
762	cmp	$8, %r13
763	jle	_less_than_8_bytes_left_\@
764
765	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
766	add	$8, \DATA_OFFSET
767	psrldq	$8, %xmm0
768	movq	%xmm0, %rax
769	sub	$8, %r13
770_less_than_8_bytes_left_\@:
771	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
772	add	$1, \DATA_OFFSET
773	shr	$8, %rax
774	sub	$1, %r13
775	jne	_less_than_8_bytes_left_\@
776_partial_block_done_\@:
777.endm # PARTIAL_BLOCK
778
779/*
780* if a = number of total plaintext bytes
781* b = floor(a/16)
782* num_initial_blocks = b mod 4
783* encrypt the initial num_initial_blocks blocks and apply ghash on
784* the ciphertext
785* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
786* are clobbered
787* arg1, %arg2, %arg3 are used as a pointer only, not modified
*
* i is the number of the xmm register that receives the running hash
* (5, 6, 7 or 8) and i_seq lists the digits of the xmm registers that hold
* the initial blocks; with i equal to 8 there are no initial blocks.
* XMM0 is the counter block, XMMDst the register that holds the hash on
* exit, TMP3 the hash key passed to GHASH_MUL.  %r11 is the data offset
* and is advanced by 16 for every block processed; %r13 is the byte count
* of whole blocks as set up by the caller.  %xmm14 is loaded with SHUF_MASK.
788*/
789
790
791.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
792	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
793	MOVADQ		SHUF_MASK(%rip), %xmm14
794
795	movdqu AadHash(%arg2), %xmm\i		    # xmm<i> = running hash
796
797	# start AES for num_initial_blocks blocks
798
799	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
800
801.if (\i == 5) || (\i == 6) || (\i == 7)
802
803	MOVADQ		ONE(%RIP),\TMP1
804	MOVADQ		0(%arg1),\TMP2
805.irpc index, \i_seq
806	paddd		\TMP1, \XMM0                 # INCR Y0
807.ifc \operation, dec
808        movdqa     \XMM0, %xmm\index
809.else
810	MOVADQ		\XMM0, %xmm\index
811.endif
812	pshufb	%xmm14, %xmm\index      # perform a 16 byte swap
813	pxor		\TMP2, %xmm\index
814.endr
815	lea	0x10(%arg1),%r10
816	mov	keysize,%eax
817	shr	$2,%eax				# 128->4, 192->6, 256->8
818	add	$5,%eax			      # 128->9, 192->11, 256->13
819
	# %eax counts the aesenc rounds; the last round uses aesenclast below.
820aes_loop_initial_\@:
821	MOVADQ	(%r10),\TMP1
822.irpc	index, \i_seq
823	aesenc	\TMP1, %xmm\index
824.endr
825	add	$16,%r10
826	sub	$1,%eax
827	jnz	aes_loop_initial_\@
828
829	MOVADQ	(%r10), \TMP1
830.irpc index, \i_seq
831	aesenclast \TMP1, %xmm\index         # Last Round
832.endr
833.irpc index, \i_seq
834	movdqu	   (%arg4 , %r11, 1), \TMP1
835	pxor	   \TMP1, %xmm\index
836	movdqu	   %xmm\index, (%arg3 , %r11, 1)
837	# write back plaintext/ciphertext for num_initial_blocks
838	add	   $16, %r11
839
	# For dec the hash input is the block that was read (ciphertext),
	# not the block that was written.
840.ifc \operation, dec
841	movdqa     \TMP1, %xmm\index
842.endif
843	pshufb	   %xmm14, %xmm\index
844
845		# prepare plaintext/ciphertext for GHASH computation
846.endr
847.endif
848
849        # apply GHASH on num_initial_blocks blocks
850
	# The hash is chained through xmm5..xmm8 starting at xmm<i>, so it
	# always ends up in xmm8.
851.if \i == 5
852        pxor       %xmm5, %xmm6
853	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
854        pxor       %xmm6, %xmm7
855	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
856        pxor       %xmm7, %xmm8
857	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
858.elseif \i == 6
859        pxor       %xmm6, %xmm7
860	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
861        pxor       %xmm7, %xmm8
862	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
863.elseif \i == 7
864        pxor       %xmm7, %xmm8
865	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
866.endif
	# Fewer than 64 bytes of whole blocks in total means no group of four
	# blocks follows, so the work below is skipped.
867	cmp	   $64, %r13
868	jl	_initial_blocks_done\@
869	# no need for precomputed values
870/*
871*
872* Precomputations for HashKey parallel with encryption of first 4 blocks.
873* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
874*/
875	MOVADQ	   ONE(%RIP),\TMP1
876	paddd	   \TMP1, \XMM0              # INCR Y0
877	MOVADQ	   \XMM0, \XMM1
878	pshufb  %xmm14, \XMM1        # perform a 16 byte swap
879
880	paddd	   \TMP1, \XMM0              # INCR Y0
881	MOVADQ	   \XMM0, \XMM2
882	pshufb  %xmm14, \XMM2        # perform a 16 byte swap
883
884	paddd	   \TMP1, \XMM0              # INCR Y0
885	MOVADQ	   \XMM0, \XMM3
886	pshufb %xmm14, \XMM3        # perform a 16 byte swap
887
888	paddd	   \TMP1, \XMM0              # INCR Y0
889	MOVADQ	   \XMM0, \XMM4
890	pshufb %xmm14, \XMM4        # perform a 16 byte swap
891
892	MOVADQ	   0(%arg1),\TMP1
893	pxor	   \TMP1, \XMM1
894	pxor	   \TMP1, \XMM2
895	pxor	   \TMP1, \XMM3
896	pxor	   \TMP1, \XMM4
897.irpc index, 1234 # do 4 rounds
898	movaps 0x10*\index(%arg1), \TMP1
899	aesenc	   \TMP1, \XMM1
900	aesenc	   \TMP1, \XMM2
901	aesenc	   \TMP1, \XMM3
902	aesenc	   \TMP1, \XMM4
903.endr
904.irpc index, 56789 # do next 5 rounds
905	movaps 0x10*\index(%arg1), \TMP1
906	aesenc	   \TMP1, \XMM1
907	aesenc	   \TMP1, \XMM2
908	aesenc	   \TMP1, \XMM3
909	aesenc	   \TMP1, \XMM4
910.endr
911	lea	   0xa0(%arg1),%r10
912	mov	   keysize,%eax
913	shr	   $2,%eax			# 128->4, 192->6, 256->8
914	sub	   $4,%eax			# 128->0, 192->2, 256->4
915	jz	   aes_loop_pre_done\@
916
	# Extra rounds for the longer key sizes.
	# NOTE(review): this loop names xmm1-xmm4 directly instead of
	# XMM1-XMM4, so it relies on the caller passing exactly those.
917aes_loop_pre_\@:
918	MOVADQ	   (%r10),\TMP2
919.irpc	index, 1234
920	aesenc	   \TMP2, %xmm\index
921.endr
922	add	   $16,%r10
923	sub	   $1,%eax
924	jnz	   aes_loop_pre_\@
925
926aes_loop_pre_done\@:
927	MOVADQ	   (%r10), \TMP2
928	aesenclast \TMP2, \XMM1
929	aesenclast \TMP2, \XMM2
930	aesenclast \TMP2, \XMM3
931	aesenclast \TMP2, \XMM4
	# XOR the keystream with four input blocks.  For dec the result is
	# stored at once and the register is reloaded with the input block
	# (the hash input); for enc the four results are stored afterwards.
932	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
933	pxor	   \TMP1, \XMM1
934.ifc \operation, dec
935	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
936	movdqa     \TMP1, \XMM1
937.endif
938	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
939	pxor	   \TMP1, \XMM2
940.ifc \operation, dec
941	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
942	movdqa     \TMP1, \XMM2
943.endif
944	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
945	pxor	   \TMP1, \XMM3
946.ifc \operation, dec
947	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
948	movdqa     \TMP1, \XMM3
949.endif
950	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
951	pxor	   \TMP1, \XMM4
952.ifc \operation, dec
953	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
954	movdqa     \TMP1, \XMM4
955.else
956	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
957	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
958	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
959	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
960.endif
961
962	add	   $64, %r11
963	pshufb %xmm14, \XMM1 # perform a 16 byte swap
964	pxor	   \XMMDst, \XMM1
965# combine GHASHed value with the corresponding ciphertext
966	pshufb %xmm14, \XMM2 # perform a 16 byte swap
967	pshufb %xmm14, \XMM3 # perform a 16 byte swap
968	pshufb %xmm14, \XMM4 # perform a 16 byte swap
969
970_initial_blocks_done\@:
971
972.endm
973
974/*
975* encrypt 4 blocks at a time
976* ghash the 4 previously encrypted ciphertext blocks
977* arg1, %arg3, %arg4 are used as pointers only, not modified
978* %r11 is the data offset value
979*/
980.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
981TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
982
983	movdqa	  \XMM1, \XMM5
984	movdqa	  \XMM2, \XMM6
985	movdqa	  \XMM3, \XMM7
986	movdqa	  \XMM4, \XMM8
987
988        movdqa    SHUF_MASK(%rip), %xmm15
989        # multiply TMP5 * HashKey using karatsuba
990
991	movdqa	  \XMM5, \TMP4
992	pshufd	  $78, \XMM5, \TMP6
993	pxor	  \XMM5, \TMP6
994	paddd     ONE(%rip), \XMM0		# INCR CNT
995	movdqu	  HashKey_4(%arg2), \TMP5
996	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
997	movdqa    \XMM0, \XMM1
998	paddd     ONE(%rip), \XMM0		# INCR CNT
999	movdqa    \XMM0, \XMM2
1000	paddd     ONE(%rip), \XMM0		# INCR CNT
1001	movdqa    \XMM0, \XMM3
1002	paddd     ONE(%rip), \XMM0		# INCR CNT
1003	movdqa    \XMM0, \XMM4
1004	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1005	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1006	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1007	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1008	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1009
1010	pxor	  (%arg1), \XMM1
1011	pxor	  (%arg1), \XMM2
1012	pxor	  (%arg1), \XMM3
1013	pxor	  (%arg1), \XMM4
1014	movdqu	  HashKey_4_k(%arg2), \TMP5
1015	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1016	movaps 0x10(%arg1), \TMP1
1017	aesenc	  \TMP1, \XMM1              # Round 1
1018	aesenc	  \TMP1, \XMM2
1019	aesenc	  \TMP1, \XMM3
1020	aesenc	  \TMP1, \XMM4
1021	movaps 0x20(%arg1), \TMP1
1022	aesenc	  \TMP1, \XMM1              # Round 2
1023	aesenc	  \TMP1, \XMM2
1024	aesenc	  \TMP1, \XMM3
1025	aesenc	  \TMP1, \XMM4
1026	movdqa	  \XMM6, \TMP1
1027	pshufd	  $78, \XMM6, \TMP2
1028	pxor	  \XMM6, \TMP2
1029	movdqu	  HashKey_3(%arg2), \TMP5
1030	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1031	movaps 0x30(%arg1), \TMP3
1032	aesenc    \TMP3, \XMM1              # Round 3
1033	aesenc    \TMP3, \XMM2
1034	aesenc    \TMP3, \XMM3
1035	aesenc    \TMP3, \XMM4
1036	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1037	movaps 0x40(%arg1), \TMP3
1038	aesenc	  \TMP3, \XMM1              # Round 4
1039	aesenc	  \TMP3, \XMM2
1040	aesenc	  \TMP3, \XMM3
1041	aesenc	  \TMP3, \XMM4
1042	movdqu	  HashKey_3_k(%arg2), \TMP5
1043	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1044	movaps 0x50(%arg1), \TMP3
1045	aesenc	  \TMP3, \XMM1              # Round 5
1046	aesenc	  \TMP3, \XMM2
1047	aesenc	  \TMP3, \XMM3
1048	aesenc	  \TMP3, \XMM4
1049	pxor	  \TMP1, \TMP4
1050# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1051	pxor	  \XMM6, \XMM5
1052	pxor	  \TMP2, \TMP6
1053	movdqa	  \XMM7, \TMP1
1054	pshufd	  $78, \XMM7, \TMP2
1055	pxor	  \XMM7, \TMP2
1056	movdqu	  HashKey_2(%arg2), \TMP5
1057
1058        # Multiply TMP5 * HashKey using karatsuba
1059
1060	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1061	movaps 0x60(%arg1), \TMP3
1062	aesenc	  \TMP3, \XMM1              # Round 6
1063	aesenc	  \TMP3, \XMM2
1064	aesenc	  \TMP3, \XMM3
1065	aesenc	  \TMP3, \XMM4
1066	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1067	movaps 0x70(%arg1), \TMP3
1068	aesenc	  \TMP3, \XMM1              # Round 7
1069	aesenc	  \TMP3, \XMM2
1070	aesenc	  \TMP3, \XMM3
1071	aesenc	  \TMP3, \XMM4
1072	movdqu	  HashKey_2_k(%arg2), \TMP5
1073	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1074	movaps 0x80(%arg1), \TMP3
1075	aesenc	  \TMP3, \XMM1              # Round 8
1076	aesenc	  \TMP3, \XMM2
1077	aesenc	  \TMP3, \XMM3
1078	aesenc	  \TMP3, \XMM4
1079	pxor	  \TMP1, \TMP4
1080# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1081	pxor	  \XMM7, \XMM5
1082	pxor	  \TMP2, \TMP6
1083
1084        # Multiply XMM8 * HashKey
1085        # XMM8 and TMP5 hold the values for the two operands
1086
1087	movdqa	  \XMM8, \TMP1
1088	pshufd	  $78, \XMM8, \TMP2
1089	pxor	  \XMM8, \TMP2
1090	movdqu	  HashKey(%arg2), \TMP5
1091	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1092	movaps 0x90(%arg1), \TMP3
1093	aesenc	  \TMP3, \XMM1             # Round 9
1094	aesenc	  \TMP3, \XMM2
1095	aesenc	  \TMP3, \XMM3
1096	aesenc	  \TMP3, \XMM4
1097	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1098	lea	  0xa0(%arg1),%r10
1099	mov	  keysize,%eax
1100	shr	  $2,%eax			# 128->4, 192->6, 256->8
1101	sub	  $4,%eax			# 128->0, 192->2, 256->4
1102	jz	  aes_loop_par_enc_done\@
1103
1104aes_loop_par_enc\@:
1105	MOVADQ	  (%r10),\TMP3
1106.irpc	index, 1234
1107	aesenc	  \TMP3, %xmm\index
1108.endr
1109	add	  $16,%r10
1110	sub	  $1,%eax
1111	jnz	  aes_loop_par_enc\@
1112
1113aes_loop_par_enc_done\@:
1114	MOVADQ	  (%r10), \TMP3
1115	aesenclast \TMP3, \XMM1           # Round 10
1116	aesenclast \TMP3, \XMM2
1117	aesenclast \TMP3, \XMM3
1118	aesenclast \TMP3, \XMM4
1119	movdqu    HashKey_k(%arg2), \TMP5
1120	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1121	movdqu	  (%arg4,%r11,1), \TMP3
1122	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1123	movdqu	  16(%arg4,%r11,1), \TMP3
1124	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1125	movdqu	  32(%arg4,%r11,1), \TMP3
1126	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1127	movdqu	  48(%arg4,%r11,1), \TMP3
1128	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1129        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1130        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1131        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1132        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1133	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1134	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1135	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1136	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1137
1138	pxor	  \TMP4, \TMP1
1139	pxor	  \XMM8, \XMM5
1140	pxor	  \TMP6, \TMP2
1141	pxor	  \TMP1, \TMP2
1142	pxor	  \XMM5, \TMP2
1143	movdqa	  \TMP2, \TMP3
1144	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1145	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1146	pxor	  \TMP3, \XMM5
1147	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1148
1149        # first phase of reduction
1150
1151	movdqa    \XMM5, \TMP2
1152	movdqa    \XMM5, \TMP3
1153	movdqa    \XMM5, \TMP4
1154# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1155	pslld     $31, \TMP2                   # packed right shift << 31
1156	pslld     $30, \TMP3                   # packed right shift << 30
1157	pslld     $25, \TMP4                   # packed right shift << 25
1158	pxor      \TMP3, \TMP2	               # xor the shifted versions
1159	pxor      \TMP4, \TMP2
1160	movdqa    \TMP2, \TMP5
1161	psrldq    $4, \TMP5                    # right shift T5 1 DW
1162	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1163	pxor      \TMP2, \XMM5
1164
1165        # second phase of reduction
1166
1167	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1168	movdqa    \XMM5,\TMP3
1169	movdqa    \XMM5,\TMP4
1170	psrld     $1, \TMP2                    # packed left shift >>1
1171	psrld     $2, \TMP3                    # packed left shift >>2
1172	psrld     $7, \TMP4                    # packed left shift >>7
1173	pxor      \TMP3,\TMP2		       # xor the shifted versions
1174	pxor      \TMP4,\TMP2
1175	pxor      \TMP5, \TMP2
1176	pxor      \TMP2, \XMM5
1177	pxor      \TMP1, \XMM5                 # result is in TMP1
1178
1179	pxor	  \XMM5, \XMM1
1180.endm
1181
1182/*
1183* decrypt 4 blocks at a time
1184* ghash the 4 previously decrypted ciphertext blocks
1185* %arg1, %arg3, %arg4 are used as pointers only, not modified
1186* %r11 is the data offset value
*
* Register usage, as visible in the macro body:
*   XMM0        counter block; incremented with ONE four times
*   XMM1..XMM4  in:  the four blocks to hash (copied to XMM5..XMM8 first)
*               out: the four input blocks just read from (%arg4,%r11),
*                    byte-swapped, with the reduced hash XORed into XMM1
*   XMM5..XMM8  scratch (GHASH operands and partial products)
*   TMP1..TMP6  scratch
*   %arg1       round keys: 0x00..0x90 directly, later keys through %r10
*   %arg2       base register for the HashKey* loads
*   %arg4+%r11  64 bytes of input, %arg3+%r11 64 bytes of output
* Clobbers %xmm15 (loaded with SHUF_MASK), %eax, %r10 and the flags.
* The AES rounds are interleaved with the carry-less multiplications.
* The "operation" parameter is not referenced inside the body.
1187*/
1188.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1189TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1190
	/* save the blocks to hash; XMM1..XMM4 are reused for counter blocks */
1191	movdqa	  \XMM1, \XMM5
1192	movdqa	  \XMM2, \XMM6
1193	movdqa	  \XMM3, \XMM7
1194	movdqa	  \XMM4, \XMM8
1195
1196        movdqa    SHUF_MASK(%rip), %xmm15
1197        # multiply XMM5 * HashKey_4 (loaded into TMP5) using karatsuba
1198
1199	movdqa	  \XMM5, \TMP4
1200	pshufd	  $78, \XMM5, \TMP6		# $78 swaps the two 64-bit halves
1201	pxor	  \XMM5, \TMP6			# both halves now hold a1+a0
1202	paddd     ONE(%rip), \XMM0		# INCR CNT
1203	movdqu	  HashKey_4(%arg2), \TMP5
1204	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1205	movdqa    \XMM0, \XMM1
1206	paddd     ONE(%rip), \XMM0		# INCR CNT
1207	movdqa    \XMM0, \XMM2
1208	paddd     ONE(%rip), \XMM0		# INCR CNT
1209	movdqa    \XMM0, \XMM3
1210	paddd     ONE(%rip), \XMM0		# INCR CNT
1211	movdqa    \XMM0, \XMM4
1212	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1213	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1214	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1215	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1216	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1217
	/* round 0: XOR the four counter blocks with the first round key */
1218	pxor	  (%arg1), \XMM1
1219	pxor	  (%arg1), \XMM2
1220	pxor	  (%arg1), \XMM3
1221	pxor	  (%arg1), \XMM4
1222	movdqu	  HashKey_4_k(%arg2), \TMP5
1223	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1224	movaps 0x10(%arg1), \TMP1
1225	aesenc	  \TMP1, \XMM1              # Round 1
1226	aesenc	  \TMP1, \XMM2
1227	aesenc	  \TMP1, \XMM3
1228	aesenc	  \TMP1, \XMM4
1229	movaps 0x20(%arg1), \TMP1
1230	aesenc	  \TMP1, \XMM1              # Round 2
1231	aesenc	  \TMP1, \XMM2
1232	aesenc	  \TMP1, \XMM3
1233	aesenc	  \TMP1, \XMM4
	/* multiply XMM6 * HashKey_3 (loaded into TMP5) using karatsuba */
1234	movdqa	  \XMM6, \TMP1
1235	pshufd	  $78, \XMM6, \TMP2
1236	pxor	  \XMM6, \TMP2
1237	movdqu	  HashKey_3(%arg2), \TMP5
1238	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1239	movaps 0x30(%arg1), \TMP3
1240	aesenc    \TMP3, \XMM1              # Round 3
1241	aesenc    \TMP3, \XMM2
1242	aesenc    \TMP3, \XMM3
1243	aesenc    \TMP3, \XMM4
1244	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1245	movaps 0x40(%arg1), \TMP3
1246	aesenc	  \TMP3, \XMM1              # Round 4
1247	aesenc	  \TMP3, \XMM2
1248	aesenc	  \TMP3, \XMM3
1249	aesenc	  \TMP3, \XMM4
1250	movdqu	  HashKey_3_k(%arg2), \TMP5
1251	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1252	movaps 0x50(%arg1), \TMP3
1253	aesenc	  \TMP3, \XMM1              # Round 5
1254	aesenc	  \TMP3, \XMM2
1255	aesenc	  \TMP3, \XMM3
1256	aesenc	  \TMP3, \XMM4
1257	pxor	  \TMP1, \TMP4
1258# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1259	pxor	  \XMM6, \XMM5
1260	pxor	  \TMP2, \TMP6
1261	movdqa	  \XMM7, \TMP1
1262	pshufd	  $78, \XMM7, \TMP2
1263	pxor	  \XMM7, \TMP2
1264	movdqu	  HashKey_2(%arg2), \TMP5
1265
1266        # Multiply XMM7 * HashKey_2 (loaded into TMP5) using karatsuba
1267
1268	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1269	movaps 0x60(%arg1), \TMP3
1270	aesenc	  \TMP3, \XMM1              # Round 6
1271	aesenc	  \TMP3, \XMM2
1272	aesenc	  \TMP3, \XMM3
1273	aesenc	  \TMP3, \XMM4
1274	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1275	movaps 0x70(%arg1), \TMP3
1276	aesenc	  \TMP3, \XMM1              # Round 7
1277	aesenc	  \TMP3, \XMM2
1278	aesenc	  \TMP3, \XMM3
1279	aesenc	  \TMP3, \XMM4
1280	movdqu	  HashKey_2_k(%arg2), \TMP5
1281	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1282	movaps 0x80(%arg1), \TMP3
1283	aesenc	  \TMP3, \XMM1              # Round 8
1284	aesenc	  \TMP3, \XMM2
1285	aesenc	  \TMP3, \XMM3
1286	aesenc	  \TMP3, \XMM4
1287	pxor	  \TMP1, \TMP4
1288# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1289	pxor	  \XMM7, \XMM5
1290	pxor	  \TMP2, \TMP6
1291
1292        # Multiply XMM8 * HashKey
1293        # XMM8 and TMP5 hold the values for the two operands
1294
1295	movdqa	  \XMM8, \TMP1
1296	pshufd	  $78, \XMM8, \TMP2
1297	pxor	  \XMM8, \TMP2
1298	movdqu	  HashKey(%arg2), \TMP5
1299	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1300	movaps 0x90(%arg1), \TMP3
1301	aesenc	  \TMP3, \XMM1             # Round 9
1302	aesenc	  \TMP3, \XMM2
1303	aesenc	  \TMP3, \XMM3
1304	aesenc	  \TMP3, \XMM4
1305	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
	/*
	 * %r10 walks the remaining round keys starting at offset 0xa0.
	 * %eax = keysize/4 - 4 is the number of extra aesenc rounds before
	 * the final aesenclast (none for a 128-bit key).
	 */
1306	lea	  0xa0(%arg1),%r10
1307	mov	  keysize,%eax
1308	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1309	sub	  $4,%eax			# 128->0, 192->2, 256->4
1310	jz	  aes_loop_par_dec_done\@
1311
	/*
	 * NOTE(review): the loop below names %xmm1..%xmm4 directly instead of
	 * using the XMM1..XMM4 macro parameters, so it only matches the other
	 * rounds when the caller passes exactly those registers - confirm at
	 * the call site before invoking this macro with different registers.
	 */
1312aes_loop_par_dec\@:
1313	MOVADQ	  (%r10),\TMP3
1314.irpc	index, 1234
1315	aesenc	  \TMP3, %xmm\index
1316.endr
1317	add	  $16,%r10
1318	sub	  $1,%eax
1319	jnz	  aes_loop_par_dec\@
1320
1321aes_loop_par_dec_done\@:
1322	MOVADQ	  (%r10), \TMP3
1323	aesenclast \TMP3, \XMM1           # last round
1324	aesenclast \TMP3, \XMM2
1325	aesenclast \TMP3, \XMM3
1326	aesenclast \TMP3, \XMM4
1327	movdqu    HashKey_k(%arg2), \TMP5
1328	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	/*
	 * For each block: load the input, XOR it with the encrypted counter,
	 * store the result, then keep the original input block in XMMn so it
	 * is the value that gets hashed on the next invocation.
	 */
1329	movdqu	  (%arg4,%r11,1), \TMP3
1330	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1331	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1332	movdqa    \TMP3, \XMM1
1333	movdqu	  16(%arg4,%r11,1), \TMP3
1334	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1335	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1336	movdqa    \TMP3, \XMM2
1337	movdqu	  32(%arg4,%r11,1), \TMP3
1338	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1339	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1340	movdqa    \TMP3, \XMM3
1341	movdqu	  48(%arg4,%r11,1), \TMP3
1342	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1343	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1344	movdqa    \TMP3, \XMM4
1345	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1346	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1347	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1348	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1349
	/* fold the fourth product in and combine the karatsuba middle term */
1350	pxor	  \TMP4, \TMP1
1351	pxor	  \XMM8, \XMM5
1352	pxor	  \TMP6, \TMP2
1353	pxor	  \TMP1, \TMP2
1354	pxor	  \XMM5, \TMP2
1355	movdqa	  \TMP2, \TMP3
1356	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1357	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1358	pxor	  \TMP3, \XMM5
1359	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1360
1361        # first phase of reduction
1362
1363	movdqa    \XMM5, \TMP2
1364	movdqa    \XMM5, \TMP3
1365	movdqa    \XMM5, \TMP4
1366# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1367	pslld     $31, \TMP2                   # packed left shift << 31
1368	pslld     $30, \TMP3                   # packed left shift << 30
1369	pslld     $25, \TMP4                   # packed left shift << 25
1370	pxor      \TMP3, \TMP2	               # xor the shifted versions
1371	pxor      \TMP4, \TMP2
1372	movdqa    \TMP2, \TMP5
1373	psrldq    $4, \TMP5                    # right shift T5 1 DW
1374	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1375	pxor      \TMP2, \XMM5
1376
1377        # second phase of reduction
1378
1379	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1380	movdqa    \XMM5,\TMP3
1381	movdqa    \XMM5,\TMP4
1382	psrld     $1, \TMP2                    # packed right shift >> 1
1383	psrld     $2, \TMP3                    # packed right shift >> 2
1384	psrld     $7, \TMP4                    # packed right shift >> 7
1385	pxor      \TMP3,\TMP2		       # xor the shifted versions
1386	pxor      \TMP4,\TMP2
1387	pxor      \TMP5, \TMP2
1388	pxor      \TMP2, \XMM5
1389	pxor      \TMP1, \XMM5                 # reduced result is in XMM5
1390
1391	pxor	  \XMM5, \XMM1		# fold the hash into the first new block
1392.endm
1393
1394/* GHASH the last 4 ciphertext blocks.
*
* XMM1..XMM4 are multiplied by HashKey_4, HashKey_3, HashKey_2 and HashKey
* (all loaded relative to %arg2) with the karatsuba method, the products
* are XORed together and the sum is reduced; the result is left in XMMDst.
* XMM1..XMM4 are destroyed and TMP1..TMP7 are scratch.
*/
1395.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1396TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1397
1398        # Multiply XMM1 * HashKey_4 (using Karatsuba)
1399
1400	movdqa	  \XMM1, \TMP6
1401	pshufd	  $78, \XMM1, \TMP2		# $78 swaps the two 64-bit halves
1402	pxor	  \XMM1, \TMP2			# both halves now hold a1+a0
1403	movdqu	  HashKey_4(%arg2), \TMP5
1404	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1405	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1406	movdqu	  HashKey_4_k(%arg2), \TMP4
1407	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1408	movdqa	  \XMM1, \XMMDst
1409	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1410
1411        # Multiply XMM2 * HashKey_3 (using Karatsuba)
1412
1413	movdqa	  \XMM2, \TMP1
1414	pshufd	  $78, \XMM2, \TMP2
1415	pxor	  \XMM2, \TMP2
1416	movdqu	  HashKey_3(%arg2), \TMP5
1417	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1418	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1419	movdqu	  HashKey_3_k(%arg2), \TMP4
1420	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1421	pxor	  \TMP1, \TMP6
1422	pxor	  \XMM2, \XMMDst
1423	pxor	  \TMP2, \XMM1
1424# results accumulated in TMP6, XMMDst, XMM1
1425
1426        # Multiply XMM3 * HashKey_2 (using Karatsuba)
1427
1428	movdqa	  \XMM3, \TMP1
1429	pshufd	  $78, \XMM3, \TMP2
1430	pxor	  \XMM3, \TMP2
1431	movdqu	  HashKey_2(%arg2), \TMP5
1432	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1433	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1434	movdqu	  HashKey_2_k(%arg2), \TMP4
1435	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1436	pxor	  \TMP1, \TMP6
1437	pxor	  \XMM3, \XMMDst
1438	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1439
1440        # Multiply XMM4 * HashKey (using Karatsuba)
1441	movdqa	  \XMM4, \TMP1
1442	pshufd	  $78, \XMM4, \TMP2
1443	pxor	  \XMM4, \TMP2
1444	movdqu	  HashKey(%arg2), \TMP5
1445	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1446	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1447	movdqu	  HashKey_k(%arg2), \TMP4
1448	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1449	pxor	  \TMP1, \TMP6
1450	pxor	  \XMM4, \XMMDst
1451	pxor	  \XMM1, \TMP2
1452	pxor	  \TMP6, \TMP2
1453	pxor	  \XMMDst, \TMP2
1454	# middle section of the temp results combined as in karatsuba algorithm
1455	movdqa	  \TMP2, \TMP4
1456	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1457	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1458	pxor	  \TMP4, \XMMDst
1459	pxor	  \TMP2, \TMP6
1460# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1461	# first phase of the reduction
1462	movdqa    \XMMDst, \TMP2
1463	movdqa    \XMMDst, \TMP3
1464	movdqa    \XMMDst, \TMP4
1465# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1466	pslld     $31, \TMP2                # packed left shift << 31
1467	pslld     $30, \TMP3                # packed left shift << 30
1468	pslld     $25, \TMP4                # packed left shift << 25
1469	pxor      \TMP3, \TMP2              # xor the shifted versions
1470	pxor      \TMP4, \TMP2
1471	movdqa    \TMP2, \TMP7
1472	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1473	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1474	pxor      \TMP2, \XMMDst
1475
1476        # second phase of the reduction
1477	movdqa    \XMMDst, \TMP2
1478	# make 3 copies of XMMDst for doing 3 shift operations
1479	movdqa    \XMMDst, \TMP3
1480	movdqa    \XMMDst, \TMP4
1481	psrld     $1, \TMP2                 # packed right shift >> 1
1482	psrld     $2, \TMP3                 # packed right shift >> 2
1483	psrld     $7, \TMP4                 # packed right shift >> 7
1484	pxor      \TMP3, \TMP2              # xor the shifted versions
1485	pxor      \TMP4, \TMP2
1486	pxor      \TMP7, \TMP2              # TMP7 was saved in the first phase
1487	pxor      \TMP2, \XMMDst
1488	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1489.endm
1490
1491
1492/* Encryption of a single block
1493* uses eax & r10
*
* XMM0 is encrypted in place with the key schedule at %arg1; TMP1 is
* scratch for the current round key.  The number of aesenc rounds is
* keysize/4 + 5 (9, 11 or 13) and is followed by one aesenclast.
* The flags are clobbered by the loop counter arithmetic.
1494*/
1495
1496.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1497
1498	pxor		(%arg1), \XMM0		# round 0: XOR with first round key
1499	mov		keysize,%eax
1500	shr		$2,%eax			# 128->4, 192->6, 256->8
1501	add		$5,%eax			# 128->9, 192->11, 256->13
1502	lea		16(%arg1), %r10	  # get first expanded key address
1503
1504_esb_loop_\@:
1505	MOVADQ		(%r10),\TMP1
1506	aesenc		\TMP1,\XMM0
1507	add		$16,%r10		# advance to the next round key
1508	sub		$1,%eax
1509	jnz		_esb_loop_\@
1510
	/* %r10 now points at the last round key */
1511	MOVADQ		(%r10),\TMP1
1512	aesenclast	\TMP1,\XMM0
1513.endm
1514/*****************************************************************************
1515* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1516*                   struct gcm_context_data *data
1517*                                      // Context data
1518*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1519*                   const u8 *in,      // Ciphertext input
1520*                   u64 plaintext_len, // Length of data in bytes for decryption.
1521*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1522*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1523*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1524*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1525*                   const u8 *aad,     // Additional Authentication Data (AAD)
1526*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1527*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1528*                                      // given authentication tag and only return the plaintext if they match.
1529*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1530*                                      // (most likely), 12 or 8.
1531*
1532* Assumptions:
1533*
1534* keys:
1535*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1536*       set of 11 keys in the data structure void *aes_ctx
1537*
1538* iv:
1539*       0                   1                   2                   3
1540*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1541*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1542*       |                             Salt  (From the SA)               |
1543*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1544*       |                     Initialization Vector                     |
1545*       |         (This is the sequence number from IPSec header)       |
1546*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547*       |                              0x1                              |
1548*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549*
1550*
1551*
1552* AAD:
1553*       AAD padded to 128 bits with 0
1554*       for example, assume AAD is a u32 vector
1555*
1556*       if AAD is 8 bytes:
1557*       AAD[3] = {A0, A1};
1558*       padded AAD in xmm register = {A1 A0 0 0}
1559*
1560*       0                   1                   2                   3
1561*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1562*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563*       |                               SPI (A1)                        |
1564*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565*       |                     32-bit Sequence Number (A0)               |
1566*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1567*       |                              0x0                              |
1568*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1569*
1570*                                       AAD Format with 32-bit Sequence Number
1571*
1572*       if AAD is 12 bytes:
1573*       AAD[3] = {A0, A1, A2};
1574*       padded AAD in xmm register = {A2 A1 A0 0}
1575*
1576*       0                   1                   2                   3
1577*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1578*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1581*       |                               SPI (A2)                        |
1582*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1583*       |                 64-bit Extended Sequence Number {A1,A0}       |
1584*       |                                                               |
1585*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586*       |                              0x0                              |
1587*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588*
1589*                        AAD Format with 64-bit Extended Sequence Number
1590*
1591* poly = x^128 + x^127 + x^126 + x^121 + 1
1592*
1593*****************************************************************************/
/*
 * One-shot GCM decryption: init, bulk processing and tag generation in a
 * single call.  By position in the prototype documented above, the macro
 * operands are arg6 = iv, arg7 = hash_subkey, arg8 = aad, arg9 = aad_len,
 * arg10 = auth_tag and arg11 = auth_tag_len.
 * FUNC_SAVE, FUNC_RESTORE and the GCM_* macros are defined elsewhere in
 * this file; FUNC_SAVE/FUNC_RESTORE presumably save and restore the
 * registers the macros clobber - confirm against their definitions.
 */
1594SYM_FUNC_START(aesni_gcm_dec)
1595	FUNC_SAVE
1596
1597	GCM_INIT %arg6, arg7, arg8, arg9
1598	GCM_ENC_DEC dec
1599	GCM_COMPLETE arg10, arg11
1600	FUNC_RESTORE
1601	ret
1602SYM_FUNC_END(aesni_gcm_dec)
1603
1604
1605/*****************************************************************************
1606* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1607*                    struct gcm_context_data *data
1608*                                        // Context data
1609*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1610*                    const u8 *in,       // Plaintext input
1611*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1612*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1613*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1614*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1615*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1616*                    const u8 *aad,      // Additional Authentication Data (AAD)
1617*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1618*                    u8 *auth_tag,       // Authenticated Tag output.
1619*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1620*                                        // 12 or 8.
1621*
1622* Assumptions:
1623*
1624* keys:
1625*       keys are pre-expanded and aligned to 16 bytes. we are using the
1626*       first set of 11 keys in the data structure void *aes_ctx
1627*
1628*
1629* iv:
1630*       0                   1                   2                   3
1631*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1632*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1633*       |                             Salt  (From the SA)               |
1634*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1635*       |                     Initialization Vector                     |
1636*       |         (This is the sequence number from IPSec header)       |
1637*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638*       |                              0x1                              |
1639*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640*
1641*
1642*
1643* AAD:
1644*       AAD padded to 128 bits with 0
1645*       for example, assume AAD is a u32 vector
1646*
1647*       if AAD is 8 bytes:
1648*       AAD[3] = {A0, A1};
1649*       padded AAD in xmm register = {A1 A0 0 0}
1650*
1651*       0                   1                   2                   3
1652*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1653*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654*       |                               SPI (A1)                        |
1655*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656*       |                     32-bit Sequence Number (A0)               |
1657*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1658*       |                              0x0                              |
1659*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1660*
1661*                                 AAD Format with 32-bit Sequence Number
1662*
1663*       if AAD is 12 bytes:
1664*       AAD[3] = {A0, A1, A2};
1665*       padded AAD in xmm register = {A2 A1 A0 0}
1666*
1667*       0                   1                   2                   3
1668*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1669*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1670*       |                               SPI (A2)                        |
1671*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1672*       |                 64-bit Extended Sequence Number {A1,A0}       |
1673*       |                                                               |
1674*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1675*       |                              0x0                              |
1676*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677*
1678*                         AAD Format with 64-bit Extended Sequence Number
1679*
1680* poly = x^128 + x^127 + x^126 + x^121 + 1
1681***************************************************************************/
/*
 * One-shot GCM encryption: init, bulk processing and tag generation in a
 * single call.  By position in the prototype documented above, the macro
 * operands are arg6 = iv, arg7 = hash_subkey, arg8 = aad, arg9 = aad_len,
 * arg10 = auth_tag and arg11 = auth_tag_len.
 * Same sequence as aesni_gcm_dec except that GCM_ENC_DEC is invoked with
 * "enc".  FUNC_SAVE, FUNC_RESTORE and the GCM_* macros are defined
 * elsewhere in this file.
 */
1682SYM_FUNC_START(aesni_gcm_enc)
1683	FUNC_SAVE
1684
1685	GCM_INIT %arg6, arg7, arg8, arg9
1686	GCM_ENC_DEC enc
1687
1688	GCM_COMPLETE arg10, arg11
1689	FUNC_RESTORE
1690	ret
1691SYM_FUNC_END(aesni_gcm_enc)
1692
1693/*****************************************************************************
1694* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1695*                     struct gcm_context_data *data,
1696*                                         // context data
1697*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1698*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1699*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1700*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1701*                     const u8 *aad,      // Additional Authentication Data (AAD)
1702*                     u64 aad_len);       // Length of AAD in bytes.
*
* Runs only GCM_INIT; iv, hash_subkey, aad and aad_len are forwarded as
* %arg3..%arg6 (by position in the prototype above).  No payload data is
* processed here.  Presumably meant to be followed by the *_update and
* finalize entry points - confirm against the C glue code.
1703*/
1704SYM_FUNC_START(aesni_gcm_init)
1705	FUNC_SAVE
1706	GCM_INIT %arg3, %arg4,%arg5, %arg6
1707	FUNC_RESTORE
1708	ret
1709SYM_FUNC_END(aesni_gcm_init)
1710
1711/*****************************************************************************
1712* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1713*                    struct gcm_context_data *data,
1714*                                        // context data
1715*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1716*                    const u8 *in,       // Plaintext input
1717*                    u64 plaintext_len); // Length of data in bytes for encryption.
*
* Runs only the bulk step (GCM_ENC_DEC enc); it neither initialises the
* context nor produces the authentication tag.  GCM_ENC_DEC takes no
* explicit operands here, so it presumably reads the arguments straight
* from the %argN registers - confirm against its definition.
1718*/
1719SYM_FUNC_START(aesni_gcm_enc_update)
1720	FUNC_SAVE
1721	GCM_ENC_DEC enc
1722	FUNC_RESTORE
1723	ret
1724SYM_FUNC_END(aesni_gcm_enc_update)
1725
1726/*****************************************************************************
1727* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1728*                    struct gcm_context_data *data,
1729*                                        // context data
1730*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1731*                    const u8 *in,       // Ciphertext input
1732*                    u64 plaintext_len); // Length of data in bytes for decryption.
*
* Runs only the bulk step (GCM_ENC_DEC dec); it neither initialises the
* context nor produces the authentication tag.  GCM_ENC_DEC takes no
* explicit operands here, so it presumably reads the arguments straight
* from the %argN registers - confirm against its definition.
1733*/
1734SYM_FUNC_START(aesni_gcm_dec_update)
1735	FUNC_SAVE
1736	GCM_ENC_DEC dec
1737	FUNC_RESTORE
1738	ret
1739SYM_FUNC_END(aesni_gcm_dec_update)
1740
1741/*****************************************************************************
1742* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1743*                    struct gcm_context_data *data,
1744*                                        // context data
1745*                    u8 *auth_tag,       // Authenticated Tag output.
1746*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1747*                                        // 12 or 8.
*
* Runs only GCM_COMPLETE, with auth_tag and auth_tag_len forwarded as
* %arg3 and %arg4 (by position in the prototype above).
1748*/
1749SYM_FUNC_START(aesni_gcm_finalize)
1750	FUNC_SAVE
1751	GCM_COMPLETE %arg3 %arg4
1752	FUNC_RESTORE
1753	ret
1754SYM_FUNC_END(aesni_gcm_finalize)
1755
1756#endif
1757
1758
/*
 * _key_expansion_128 / _key_expansion_256a (two names for the same code):
 * derive one 16-byte round key and append it to the key schedule.
 *
 * In:   %xmm0 = key words w0..w3 to update
 *       %xmm1 = aeskeygenassist result; its dword 3 (t) is used
 *       %xmm4 = scratch whose dword 0 must be zero on entry
 *       TKEYP = address at which the new round key is stored
 * Out:  %xmm0 = { w0^t, w0^w1^t, w0^w1^w2^t, w0^w1^w2^w3^t },
 *       also written to (TKEYP); TKEYP is advanced by 0x10
 * Clobbers %xmm1, %xmm4 (dword 0 is still zero on return) and the flags.
 */
1759SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
1760SYM_FUNC_START_LOCAL(_key_expansion_256a)
1761	pshufd $0b11111111, %xmm1, %xmm1	# broadcast dword 3 of xmm1
1762	shufps $0b00010000, %xmm0, %xmm4	# xmm4 = { 0, 0, w1, w0 }
1763	pxor %xmm4, %xmm0
1764	shufps $0b10001100, %xmm0, %xmm4	# xmm4 = { 0, w0, w0, w1^w2 }
1765	pxor %xmm4, %xmm0			# xmm0 = running XOR of w0..w3
1766	pxor %xmm1, %xmm0
1767	movaps %xmm0, (TKEYP)
1768	add $0x10, TKEYP
1769	ret
1770SYM_FUNC_END(_key_expansion_256a)
1771SYM_FUNC_END_ALIAS(_key_expansion_128)
1772
/*
 * _key_expansion_192a: advance the 192-bit key state held in %xmm0 and
 * the low half of %xmm2, and store 32 bytes of key schedule.
 *
 * In:   %xmm0, %xmm2 = current key state
 *       %xmm1 = aeskeygenassist result; its dword 1 is used
 *       %xmm4 = scratch whose dword 0 must be zero on entry
 * Out:  %xmm0 = running XOR of its four dwords, XORed with the broadcast
 *               dword 1 of the incoming %xmm1
 *       %xmm2 = old xmm2 ^ (old xmm2 shifted left by 4 bytes)
 *               ^ broadcast dword 3 of the new %xmm0
 *       (TKEYP)     = { old xmm2[0], old xmm2[1], new xmm0[0], new xmm0[1] }
 *       0x10(TKEYP) = { new xmm0[2], new xmm0[3], new xmm2[0], new xmm2[1] }
 *       TKEYP is advanced by 0x20
 * Clobbers %xmm1, %xmm3, %xmm4 (dword 0 stays zero), %xmm5, %xmm6, flags.
 */
1773SYM_FUNC_START_LOCAL(_key_expansion_192a)
1774	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1
1775	shufps $0b00010000, %xmm0, %xmm4
1776	pxor %xmm4, %xmm0
1777	shufps $0b10001100, %xmm0, %xmm4
1778	pxor %xmm4, %xmm0			# xmm0 = running XOR of its dwords
1779	pxor %xmm1, %xmm0
1780
1781	movaps %xmm2, %xmm5
1782	movaps %xmm2, %xmm6			# xmm6 keeps the old xmm2 for the store
1783	pslldq $4, %xmm5
1784	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of new xmm0
1785	pxor %xmm3, %xmm2
1786	pxor %xmm5, %xmm2
1787
1788	movaps %xmm0, %xmm1
1789	shufps $0b01000100, %xmm0, %xmm6	# { old xmm2 low half, new xmm0 low half }
1790	movaps %xmm6, (TKEYP)
1791	shufps $0b01001110, %xmm2, %xmm1	# { new xmm0 high half, new xmm2 low half }
1792	movaps %xmm1, 0x10(TKEYP)
1793	add $0x20, TKEYP
1794	ret
1795SYM_FUNC_END(_key_expansion_192a)
1796
/*
 * _key_expansion_192b: same %xmm0 / %xmm2 update as _key_expansion_192a,
 * but only the new %xmm0 is stored.
 *
 * In:   %xmm0, %xmm2 = current key state
 *       %xmm1 = aeskeygenassist result; its dword 1 is used
 *       %xmm4 = scratch whose dword 0 must be zero on entry
 * Out:  %xmm0 = running XOR of its four dwords, XORed with the broadcast
 *               dword 1 of the incoming %xmm1; also written to (TKEYP)
 *       %xmm2 = old xmm2 ^ (old xmm2 shifted left by 4 bytes)
 *               ^ broadcast dword 3 of the new %xmm0
 *       TKEYP is advanced by 0x10
 * Clobbers %xmm1, %xmm3, %xmm4 (dword 0 stays zero), %xmm5 and the flags.
 */
1797SYM_FUNC_START_LOCAL(_key_expansion_192b)
1798	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1
1799	shufps $0b00010000, %xmm0, %xmm4
1800	pxor %xmm4, %xmm0
1801	shufps $0b10001100, %xmm0, %xmm4
1802	pxor %xmm4, %xmm0			# xmm0 = running XOR of its dwords
1803	pxor %xmm1, %xmm0
1804
1805	movaps %xmm2, %xmm5
1806	pslldq $4, %xmm5
1807	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of new xmm0
1808	pxor %xmm3, %xmm2
1809	pxor %xmm5, %xmm2
1810
1811	movaps %xmm0, (TKEYP)
1812	add $0x10, TKEYP
1813	ret
1814SYM_FUNC_END(_key_expansion_192b)
1815
/*
 * _key_expansion_256b:	internal ABI
 *	Second half of a 256-bit key schedule round; same shape as
 *	_key_expansion_256a but it updates %xmm2 and uses dword 2 of the
 *	assist value.
 * input:
 *	%xmm1:	aeskeygenassist result for this round
 *	%xmm2:	four key words to be updated
 *	%xmm4:	low dword must be zero
 *	TKEYP:	address the new round key is written to
 * output:
 *	%xmm2:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 16
 * changed:
 *	%xmm1, %xmm4 (its low dword is still zero on return)
 */
SYM_FUNC_START_LOCAL(_key_expansion_256b)
	pshufd $0xaa, %xmm1, %xmm1	# broadcast dword 2 of the assist value
	shufps $0x10, %xmm2, %xmm4	# running XOR over the dwords of xmm2
	pxor %xmm4, %xmm2
	shufps $0x8c, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)		# emit the round key
	add $16, TKEYP
	ret
SYM_FUNC_END(_key_expansion_256b)
1827
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Expands the user key into ctx:
 *   - the encryption round keys are written from offset 0,
 *   - the decryption round keys are written from offset 240,
 *   - key_len is stored at offset 480.
 * The low byte of key_len selects the schedule: below 24 takes the 128-bit
 * path, exactly 24 the 192-bit path, anything else the 256-bit path.
 * AREG is zeroed before returning.
 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: arguments live on the stack; KEYP is saved and restored */
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# first 16 bytes of the user key
	movaps %xmm0, (KEYP)		# ... are round key 0
	lea 0x10(KEYP), TKEYP		# TKEYP = next round key slot
	movl %edx, 480(KEYP)		# remember key_len in the context
	pxor %xmm4, %xmm4		# the _key_expansion_* helpers need xmm4 == 0
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192

	/* 256-bit key: second 16 bytes of the user key are round key 1 */
	movups 0x10(UKEYP), %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_256a
	aeskeygenassist $0x1, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_256a
	aeskeygenassist $0x2, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_256a
	aeskeygenassist $0x4, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_256a
	aeskeygenassist $0x8, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_256a
	aeskeygenassist $0x20, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key

.Lenc_key192:
	/* 192-bit key: the remaining 8 bytes go into the low qword of xmm2 */
	movq 0x10(UKEYP), %xmm2
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_192a
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_192b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_192a
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_192b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_192a
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_192b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_192a
	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key

.Lenc_key128:
	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
	call _key_expansion_128
	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
	call _key_expansion_128
	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
	call _key_expansion_128
	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
	call _key_expansion_128
	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
	call _key_expansion_128
	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
	call _key_expansion_128
	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
	call _key_expansion_128
	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
	call _key_expansion_128
	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
	call _key_expansion_128
	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
	call _key_expansion_128

.Ldec_key:
	/*
	 * Build the decryption schedule at offset 240: the encryption round
	 * keys in reverse order, with every key except the first and the
	 * last passed through aesimc.
	 */
	sub $0x10, TKEYP		# TKEYP = last encryption round key
	movaps (KEYP), %xmm0		# first encryption round key
	movaps (TKEYP), %xmm1		# last encryption round key
	movaps %xmm0, 240(TKEYP)	# first enc key becomes last dec key
	movaps %xmm1, 240(KEYP)		# last enc key becomes first dec key
	add $0x10, KEYP			# KEYP walks up through the enc keys
	lea 240-16(TKEYP), UKEYP	# UKEYP walks down through the dec keys
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	aesimc %xmm0, %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop		# until KEYP reaches the last enc key
	xor AREG, AREG
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_set_key)
1942
/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts the single 16-byte block at src with the key schedule in ctx and
 * stores the result at dst.  Neither buffer needs to be aligned (movups).
 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: save the registers we borrow, then load the stack args */
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length stored by aesni_set_key
	movups (INP), STATE		# plaintext block
	call _aesni_enc1
	movups STATE, (OUTP)		# ciphertext block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_enc)
1966
/*
 * _aesni_enc1:		internal ABI
 *	Encrypt one block.
 * input:
 *	KEYP:		key struct pointer (encryption round keys at offset 0)
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128			# key shorter than 24 bytes
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp above
	je .Lenc192			# key of exactly 24 bytes
	add $0x20, TKEYP
	/* longer keys only: two extra rounds */
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc192:
	/* two more rounds, reached by fall through or by the 24-byte key */
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc128:
	/* common tail: nine aesenc rounds and the final aesenclast */
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps (TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE		# last round
	ret
SYM_FUNC_END(_aesni_enc1)
2023
/*
 * _aesni_enc4:	internal ABI
 *	Encrypt four blocks with the same key schedule; each round key is
 *	loaded once and applied to all four states.
 * input:
 *	KEYP:		key struct pointer (encryption round keys at offset 0)
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128			# key shorter than 24 bytes
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp above
	je .L4enc192			# key of exactly 24 bytes
	add $0x20, TKEYP
	/* longer keys only: two extra rounds */
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc192:
	/* two more rounds, reached by fall through or by the 24-byte key */
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
#.align 4
.L4enc128:
	/* common tail: nine aesenc rounds and the final aesenclast */
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps (TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE1		# last round
	aesenclast KEY, STATE2
	aesenclast KEY, STATE3
	aesenclast KEY, STATE4
	ret
SYM_FUNC_END(_aesni_enc4)
2131
/*
 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
 *
 * Decrypts the single 16-byte block at src and stores the result at dst.
 * The decryption round keys are read from offset 240 of ctx, which is where
 * aesni_set_key writes them.  Neither buffer needs to be aligned (movups).
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: save the registers we borrow, then load the stack args */
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length stored by aesni_set_key
	add $240, KEYP			# switch to the decryption round keys
	movups (INP), STATE		# ciphertext block
	call _aesni_dec1
	movups STATE, (OUTP)		# plaintext block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_dec)
2156
/*
 * _aesni_dec1:		internal ABI
 *	Decrypt one block.
 * input:
 *	KEYP:		pointer to the first decryption round key
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128			# key shorter than 24 bytes
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp above
	je .Ldec192			# key of exactly 24 bytes
	add $0x20, TKEYP
	/* longer keys only: two extra rounds */
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec192:
	/* two more rounds, reached by fall through or by the 24-byte key */
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE
.align 4
.Ldec128:
	/* common tail: nine aesdec rounds and the final aesdeclast */
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps (TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE		# last round
	ret
SYM_FUNC_END(_aesni_dec1)
2213
/*
 * _aesni_dec4:	internal ABI
 *	Decrypt four blocks with the same key schedule; each round key is
 *	loaded once and applied to all four states.
 * input:
 *	KEYP:		pointer to the first decryption round key
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128			# key shorter than 24 bytes
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp above
	je .L4dec192			# key of exactly 24 bytes
	add $0x20, TKEYP
	/* longer keys only: two extra rounds */
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec192:
	/* two more rounds, reached by fall through or by the 24-byte key */
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec128:
	/* common tail: nine aesdec rounds and the final aesdeclast */
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps (TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	ret
SYM_FUNC_END(_aesni_dec4)
2321
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypts the whole 16-byte blocks contained in len bytes at src and
 * writes them to dst: four blocks at a time while at least 64 bytes remain,
 * then one block at a time.  A tail shorter than 16 bytes is left
 * untouched.  Nothing happens when len is below 16.
 *
 * len is an unsigned byte count, so every length test uses the unsigned
 * condition codes (jb/jae).
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: save the registers we borrow, then load the stack args */
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN	# key length stored by aesni_set_key
	cmp $16, LEN
	jb .Lecb_enc_ret	# not even one full block
	cmp $64, LEN
	jb .Lecb_enc_loop1	# fewer than four blocks
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4	# unsigned: at least four blocks left
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1	# unsigned: at least one block left
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_ecb_enc)
2381
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypts the whole 16-byte blocks contained in len bytes at src and
 * writes them to dst: four blocks at a time while at least 64 bytes remain,
 * then one block at a time.  A tail shorter than 16 bytes is left
 * untouched.  Nothing happens when len is below 16.
 *
 * len is an unsigned byte count, so every length test uses the unsigned
 * condition codes (jb/jae).
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: save the registers we borrow, then load the stack args */
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN	# key length stored by aesni_set_key
	add $240, KEYP		# switch to the decryption round keys
	cmp $16, LEN
	jb .Lecb_dec_ret	# not even one full block
	cmp $64, LEN
	jb .Lecb_dec_loop1	# fewer than four blocks
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4	# unsigned: at least four blocks left
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1	# unsigned: at least one block left
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_ecb_dec)
2442
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypts the whole 16-byte blocks contained in len bytes at src into
 * dst, one block at a time.  The chaining value is read from iv and the
 * last ciphertext block is written back to iv on exit.  When len is below
 * 16 nothing is processed and iv is not written.
 *
 * len is an unsigned byte count, so every length test uses the unsigned
 * condition codes (jb/jae).
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: save the registers we borrow, then load the stack args */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret	# not even one full block
	mov 480(KEYP), KLEN	# key length stored by aesni_set_key
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		# chain: plaintext ^ previous ciphertext (or iv)
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop	# unsigned: at least one block left
	movups STATE, (IVP)	# last ciphertext block is the next iv
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cbc_enc)
2486
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypts the whole 16-byte blocks contained in len bytes at src into
 * dst: four blocks at a time while at least 64 bytes remain, then one block
 * at a time.  The chaining value is read from iv and the last ciphertext
 * block consumed is written back to iv on exit.  When len is below 16
 * nothing is processed and iv is not written.
 *
 * len is an unsigned byte count, so every length test uses the unsigned
 * condition codes (jb/jae).
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/* 32-bit: save the registers we borrow, then load the stack args */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret	# not even one full block: leave iv alone
	mov 480(KEYP), KLEN	# key length stored by aesni_set_key
	add $240, KEYP		# switch to the decryption round keys
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1	# fewer than four blocks
.align 4
.Lcbc_dec_loop4:
	/* keep a copy of each ciphertext block: it chains into the next one */
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/* 32-bit path: IN1/IN2 are reused for the third and fourth block */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV		# fourth ciphertext block is the next iv
#else
	pxor IN1, STATE4	# IN1 holds the third ciphertext block here
	movaps IN2, IV		# IN2 holds the fourth: it is the next iv
	/* blocks one and two are read again; nothing was stored to dst yet */
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4	# unsigned: at least four blocks left
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV		# this ciphertext block is the next iv
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1	# unsigned: at least one block left
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
SYM_FUNC_END(aesni_cbc_dec)
2579
2580#ifdef __x86_64__
/*
 * pshufb control mask that reverses the byte order of a 16-byte value;
 * loaded into BSWAP_MASK by _aesni_inc_init.
 */
.pushsection .rodata
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8
	.byte 7, 6, 5, 4, 3, 2, 1, 0
.popsection
2586
/*
 * _aesni_inc_init:	internal ABI
 *	Set up the registers that _aesni_inc works with.
 * input:
 *	IV:		counter block, big endian
 * output:
 *	CTR:		== IV, in little endian
 *	TCTR_LOW:	== lower qword of CTR
 *	INC:		== 1, in little endian
 *	BSWAP_MASK:	== endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR		# byte-reverse IV into CTR
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC		# INC = 1 (TCTR_LOW is only a scratch here)
	movq CTR, TCTR_LOW		# shadow copy of the low qword of CTR
	ret
SYM_FUNC_END(_aesni_inc_init)
2607
/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1; IV is kept in big endian.
 * input:
 *	IV
 *	CTR:		== IV, in little endian
 *	TCTR_LOW:	== lower qword of CTR
 *	INC:		== 1, in little endian
 *	BSWAP_MASK:	== endian swapping mask
 * output:
 *	IV:		increased by 1
 * changed:
 *	CTR:		== output IV, in little endian
 *	TCTR_LOW:	== lower qword of CTR
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR			# low qword += 1, no carry between qwords
	add $1, TCTR_LOW		# same increment on the shadow copy sets CF
	jnc .Linc_low
	/* the low qword wrapped: carry one into the high qword of CTR */
	pslldq $8, INC			# move the 1 into the high qword
	paddq INC, CTR
	psrldq $8, INC			# and back again for the next call
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV		# back to big endian
	ret
SYM_FUNC_END(_aesni_inc)
2635
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode over the whole 16-byte blocks contained in len bytes at src:
 * each block is XORed with the encryption of the current counter and
 * written to dst; the counter is advanced by _aesni_inc after each use.
 * Four blocks are handled per iteration while at least 64 bytes remain,
 * then one block at a time.  The updated counter is written back to iv.
 * When len is below 16 nothing is processed and iv is not written.
 *
 * Only built for x86_64 (this code sits inside #ifdef __x86_64__).
 *
 * len is an unsigned byte count, so every length test uses the unsigned
 * condition codes (jb/jae).
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret	# not even one full block: leave iv alone
	mov 480(KEYP), KLEN	# key length stored by aesni_set_key
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1	# fewer than four blocks
.align 4
.Lctr_enc_loop4:
	/* take four consecutive counter values, loading input in between */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4	# encrypt the four counter blocks
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4	# unsigned: at least four blocks left
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1	# encrypt the counter block
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1	# unsigned: at least one block left
.Lctr_enc_ret:
	movups IV, (IVP)	# hand the advanced counter back
.Lctr_enc_just_ret:
	FRAME_END
	ret
SYM_FUNC_END(aesni_ctr_enc)
2698
/*
 * _aesni_gf128mul_x_ble:		internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 *
 * paddq doubles each qword of IV separately, so the bit shifted out of each
 * qword is recovered from the sign masks built in CTR and folded back in
 * through GF128MUL_MASK.
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR;		/* CTR dwords = IV dwords 3, 0, 1, 0 */ \
	paddq IV, IV;			/* shift each qword left by one bit */ \
	psrad $31, CTR;			/* each dword becomes its sign mask */ \
	pand GF128MUL_MASK, CTR;	/* keep only the correction constants */ \
	pxor CTR, IV;
2716
/*
 * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			 const u8 *src, bool enc, le128 *iv)
 *
 * Processes eight consecutive 16-byte blocks (0x00..0x70) in XTS mode,
 * as two batches of four.  For each block the current tweak is XORed into
 * the input, parked in the matching dst slot while the cipher runs, and
 * XORed into the cipher output afterwards.  The tweak is advanced with
 * _aesni_gf128mul_x_ble() between blocks; the tweak following the eighth
 * block is written back to iv.
 *
 * enc == 0 selects _aesni_dec4 and the round keys at ctx + 240, any other
 * value selects _aesni_enc4 and the round keys at ctx + 0.
 *
 * Only built for x86_64 (this code sits inside #ifdef __x86_64__).
 */
SYM_FUNC_START(aesni_xts_crypt8)
	FRAME_BEGIN
	cmpb $0, %cl			# enc == 0 ?
	movl $0, %ecx			# mov/lea leave the flags for the cmovs
	movl $240, %r10d
	leaq _aesni_enc4, %r11
	leaq _aesni_dec4, %rax
	cmovel %r10d, %ecx		# decrypt: key offset 240
	cmoveq %rax, %r11		# decrypt: call _aesni_dec4

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN		# key length, read before KEYP is moved
	addq %rcx, KEYP

	/* first batch: blocks 0..3 */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)		# park the tweak in dst

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC r11

	/*
	 * Finish blocks 0..3 one by one; as each STATE register is stored
	 * it is reloaded with the matching block of the second batch.
	 */
	movdqu 0x00(OUTP), INC		# fetch the parked tweak
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)		# tweak for the block after these eight

	CALL_NOSPEC r11

	/* second batch: finish blocks 4..7 */
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
SYM_FUNC_END(aesni_xts_crypt8)
2826
2827#endif
2828