xref: /openbmc/linux/arch/x86/crypto/aesni-intel_asm.S (revision 7f2e85840871f199057e65232ebde846192ed989)
1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 *    Author: Huang Ying <ying.huang@intel.com>
9 *            Vinodh Gopal <vinodh.gopal@intel.com>
10 *            Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
16 *             Adrian Hoban <adrian.hoban@intel.com>
17 *             James Guilford (james.guilford@intel.com)
18 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
19 *             Tadeusz Struk (tadeusz.struk@intel.com)
20 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
21 *    Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 *    Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34#include <asm/frame.h>
35#include <asm/nospec-branch.h>
36
37/*
38 * The following macros are used to move an (un)aligned 16 byte value to/from
39 * an XMM register.  This can be done for either FP or integer values, for FP use
40 * movaps (move aligned packed single) or integer use movdqa (move double quad
41 * aligned).  It doesn't make a performance difference which instruction is used
42 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
43 * shorter, so that is the one we'll use for now. (same for unaligned).
 *
 * MOVADQ: 16-byte move for aligned operands   (expands to movaps).
 * MOVUDQ: 16-byte move for unaligned operands (expands to movups).
44 */
45#define MOVADQ	movaps
46#define MOVUDQ	movups
47
48#ifdef __x86_64__
49
50# constants in mergeable sections, linker can reorder and merge
/* Each constant below is one 16-byte value in its own 16-byte aligned section. */
51.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
52.align 16
53.Lgf128mul_x_ble_mask:
54	.octa 0x00000000000000010000000000000087
55.section	.rodata.cst16.POLY, "aM", @progbits, 16
56.align 16
57POLY:   .octa 0xC2000000000000000000000000000001
58.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
59.align 16
60TWOONE: .octa 0x00000001000000000000000000000001
61
/* SHUF_MASK: pshufb control that reverses the 16 bytes of a block (byte swap). */
62.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
63.align 16
64SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
/* MASK1: low 64 bits set, high 64 bits clear. */
65.section	.rodata.cst16.MASK1, "aM", @progbits, 16
66.align 16
67MASK1:      .octa 0x0000000000000000ffffffffffffffff
/* MASK2: high 64 bits set, low 64 bits clear. */
68.section	.rodata.cst16.MASK2, "aM", @progbits, 16
69.align 16
70MASK2:      .octa 0xffffffffffffffff0000000000000000
/* ONE: added (paddd) to the byte-swapped counter block to step the counter. */
71.section	.rodata.cst16.ONE, "aM", @progbits, 16
72.align 16
73ONE:        .octa 0x00000000000000000000000000000001
/*
 * NOTE(review): the F_MIN_MASK literal has 31 hex digits, so the top nibble
 * is implicitly zero - confirm this is the intended value.
 */
74.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
75.align 16
76F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
77.section	.rodata.cst16.dec, "aM", @progbits, 16
78.align 16
79dec:        .octa 0x1
80.section	.rodata.cst16.enc, "aM", @progbits, 16
81.align 16
82enc:        .octa 0x2
83
84# order of these constants should not change.
85# more specifically, ALL_F should follow SHIFT_MASK,
86# and zero should follow ALL_F
/* These three live in plain .rodata (not mergeable) so their layout is kept. */
87.section	.rodata, "a", @progbits
88.align 16
89SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
90ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
91            .octa 0x00000000000000000000000000000000
92
93.text
94
95
/*
 * STACK_OFFSET is the bias added when addressing stack arguments through
 * %r14 (see arg7..arg10 below).  Presumably it accounts for three 8-byte
 * register pushes made before %r14 is set up - verify against the callers.
 *
 * HashKey .. HashKey_4_k are byte offsets of eight 16-byte slots addressed
 * relative to %rsp; VARIABLE_OFFSET equals their total size (16*8 bytes).
 */
96#define	STACK_OFFSET    8*3
97#define	HashKey		16*0	// store HashKey <<1 mod poly here
98#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
99#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
100#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
101#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
102				// bits of  HashKey <<1 mod poly here
103				//(for Karatsuba purposes)
104#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
105				// bits of  HashKey^2 <<1 mod poly here
106				// (for Karatsuba purposes)
107#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
108				// bits of  HashKey^3 <<1 mod poly here
109				// (for Karatsuba purposes)
110#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
111				// bits of  HashKey^4 <<1 mod poly here
112				// (for Karatsuba purposes)
113#define	VARIABLE_OFFSET	16*8
114
/*
 * arg1..arg6 are bare register names (used as %arg1 etc.); arg7..arg10 are
 * complete memory operands relative to %r14.
 * keysize is a memory operand at byte offset 2*15*16 from the pointer in
 * arg1; the macros below load it as 32 bits and shift it right by 2 to get
 * 4/6/8 for 128/192/256-bit keys.
 */
115#define arg1 rdi
116#define arg2 rsi
117#define arg3 rdx
118#define arg4 rcx
119#define arg5 r8
120#define arg6 r9
121#define arg7 STACK_OFFSET+8(%r14)
122#define arg8 STACK_OFFSET+16(%r14)
123#define arg9 STACK_OFFSET+24(%r14)
124#define arg10 STACK_OFFSET+32(%r14)
125#define keysize 2*15*16(%arg1)
126#endif
127
128
/*
 * Symbolic names for the XMM registers.  STATE and IN are aliases of STATE1
 * and IN1 (%xmm0 and %xmm1).
 */
129#define STATE1	%xmm0
130#define STATE2	%xmm4
131#define STATE3	%xmm5
132#define STATE4	%xmm6
133#define STATE	STATE1
134#define IN1	%xmm1
135#define IN2	%xmm7
136#define IN3	%xmm8
137#define IN4	%xmm9
138#define IN	IN1
139#define KEY	%xmm2
140#define IV	%xmm3
141
142#define BSWAP_MASK %xmm10
143#define CTR	%xmm11
144#define INC	%xmm12
145
/* Note: GF128MUL_MASK shares %xmm10 with BSWAP_MASK. */
146#define GF128MUL_MASK %xmm10
147
/*
 * General-purpose register names, chosen per word size.
 * On x86_64, KEYP/OUTP/INP/LEN/IVP are %rdi/%rsi/%rdx/%rcx/%r8, the same
 * registers named by arg1..arg5 above.  UKEYP aliases OUTP, TKEYP aliases
 * T1, and TCTR_LOW aliases T2 (64-bit only).
 * On 32-bit, OUTP (and therefore UKEYP) aliases AREG (%eax).
 */
148#ifdef __x86_64__
149#define AREG	%rax
150#define KEYP	%rdi
151#define OUTP	%rsi
152#define UKEYP	OUTP
153#define INP	%rdx
154#define LEN	%rcx
155#define IVP	%r8
156#define KLEN	%r9d
157#define T1	%r10
158#define TKEYP	T1
159#define T2	%r11
160#define TCTR_LOW T2
161#else
162#define AREG	%eax
163#define KEYP	%edi
164#define OUTP	AREG
165#define UKEYP	OUTP
166#define INP	%edx
167#define LEN	%esi
168#define IVP	%ebp
169#define KLEN	%ebx
170#define T1	%ecx
171#define TKEYP	T1
172#endif
173
174
175#ifdef __x86_64__
176/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
177*
178*
179* Input: A and B (128-bits each, bit-reflected)
180* Output: C = A*B*x mod poly, (i.e. >>1 )
181* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
182* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
183*
* Operands:
*   GH         in/out: first factor on entry, reduced product on exit
*   HK         in:     second factor, only read, never written
*   TMP1-TMP5  scratch XMM registers, clobbered
*
* The 256-bit product is built from three carry-less multiplies (Karatsuba)
* and then reduced in two shift-and-xor phases.
184*/
185.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
186	movdqa	  \GH, \TMP1
187	pshufd	  $78, \GH, \TMP2
188	pshufd	  $78, \HK, \TMP3
189	pxor	  \GH, \TMP2            # TMP2 = a1+a0
190	pxor	  \HK, \TMP3            # TMP3 = b1+b0
191	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
192	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
193	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
194	pxor	  \GH, \TMP2
195	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0), the middle term
196	movdqa	  \TMP2, \TMP3
197	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
198	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
199	pxor	  \TMP3, \GH
200	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
201
202        # first phase of the reduction
203
204	movdqa    \GH, \TMP2
205	movdqa    \GH, \TMP3
206	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
207					# in order to perform
208					# independent shifts
209	pslld     $31, \TMP2            # packed left shift <<31
210	pslld     $30, \TMP3            # packed left shift <<30
211	pslld     $25, \TMP4            # packed left shift <<25
212	pxor      \TMP3, \TMP2          # xor the shifted versions
213	pxor      \TMP4, \TMP2
214	movdqa    \TMP2, \TMP5
215	psrldq    $4, \TMP5             # right shift TMP5 1 DW
216	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
217	pxor      \TMP2, \GH
218
219        # second phase of the reduction
220
221	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
222					# in order to perform
223					# independent shifts
224	movdqa    \GH,\TMP3
225	movdqa    \GH,\TMP4
226	psrld     $1,\TMP2              # packed right shift >>1
227	psrld     $2,\TMP3              # packed right shift >>2
228	psrld     $7,\TMP4              # packed right shift >>7
229	pxor      \TMP3,\TMP2		# xor the shifted versions
230	pxor      \TMP4,\TMP2
231	pxor      \TMP5, \TMP2          # fold in the bits saved in phase one
232	pxor      \TMP2, \GH
233	pxor      \TMP1, \GH            # add the high half; result is in GH
234.endm
235
236# Reads DLEN bytes starting at DPTR and stores in XMMDst
237# where 0 < DLEN < 16
238# Clobbers %rax, DLEN and XMM1
/*
 * Input byte k ends up in byte k of XMMDst (little-endian order).  The tail
 * bytes are fetched one at a time, last byte first, so nothing beyond
 * DPTR+DLEN-1 is read.
 * DLEN must be non-zero: both byte loops decrement before testing for zero.
 * The flags are clobbered as well.
 * NOTE(review): relies on MOVQ_R64_XMM (defined elsewhere) clearing the upper
 * 64 bits of its destination, as movq does - confirm.
 */
239.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
240        cmp $8, \DLEN
241        jl _read_lt8_\@
242        mov (\DPTR), %rax               # DLEN >= 8: take the first 8 bytes at once
243        MOVQ_R64_XMM %rax, \XMMDst
244        sub $8, \DLEN                   # DLEN = bytes left past the first 8
245        jz _done_read_partial_block_\@
246	xor %eax, %eax
247_read_next_byte_\@:
248        shl $8, %rax
249        mov 7(\DPTR, \DLEN, 1), %al     # byte at offset 8 + (DLEN - 1)
250        dec \DLEN
251        jnz _read_next_byte_\@
252        MOVQ_R64_XMM %rax, \XMM1
253	pslldq $8, \XMM1                # move the tail into the upper 8 bytes
254        por \XMM1, \XMMDst
255	jmp _done_read_partial_block_\@
256_read_lt8_\@:
257	xor %eax, %eax
258_read_next_byte_lt8_\@:
259        shl $8, %rax
260        mov -1(\DPTR, \DLEN, 1), %al    # byte at offset DLEN - 1
261        dec \DLEN
262        jnz _read_next_byte_lt8_\@
263        MOVQ_R64_XMM %rax, \XMMDst
264_done_read_partial_block_\@:
265.endm
266
267/*
268* if a = number of total plaintext bytes
269* b = floor(a/16)
270* num_initial_blocks = b mod 4
271* encrypt the initial num_initial_blocks blocks and apply ghash on
272* the ciphertext
273* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
274* are clobbered
275* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*
* Decrypt variant: the input blocks read from (%arg3) are the ciphertext, so
* the input (not the output) is what gets fed to GHASH.
*
* Parameters:
*   num_initial_blocks, operation  only used to build label names
*   i        number N of the first %xmmN holding an initial block; %xmm\i
*            also receives the hash of the AAD.  Initial blocks are only
*            processed for i = 5, 6 or 7.
*   i_seq    digits of the xmm registers holding the initial blocks,
*            walked with .irpc
*   TMP3     hash key operand handed to GHASH_MUL as HK; only read
*   XMM0     counter block: loaded from (%arg5), byte swapped, then
*            incremented once per block
*   XMM1-4   the next four counter blocks, processed only when %r13 >= 64;
*            on exit they hold the byte-swapped input blocks, with XMMDst
*            XORed into XMM1.  XMM2 doubles as the AAD hash accumulator and
*            XMM1 as GHASH_MUL scratch before that.
*   XMMDst   hash value XORed into XMM1 at the end
*   TMP1, TMP2, TMP4-TMP7  scratch
*
* Also used: arg7 = AAD pointer, arg8 = AAD length, %xmm14 = SHUF_MASK.
* When %r13 >= 64 the slots HashKey_k, HashKey_2..4 and HashKey_2_k..4_k at
* %rsp are written.  On exit %r11 is the byte offset of the next block.
276*/
277
278
279.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
280XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
281        MOVADQ     SHUF_MASK(%rip), %xmm14
282	mov	   arg7, %r10           # %r10 = AAD
283	mov	   arg8, %r11           # %r11 = aadLen
284	pxor	   %xmm\i, %xmm\i
285	pxor       \XMM2, \XMM2
286
287	cmp	   $16, %r11
288	jl	   _get_AAD_rest\num_initial_blocks\operation
289_get_AAD_blocks\num_initial_blocks\operation:
290	movdqu	   (%r10), %xmm\i
291	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
292	pxor	   %xmm\i, \XMM2
293	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
294	add	   $16, %r10
295	sub	   $16, %r11
296	cmp	   $16, %r11
297	jge	   _get_AAD_blocks\num_initial_blocks\operation
298
299	movdqu	   \XMM2, %xmm\i             # keep the running hash in %xmm\i
300
301	/* read the last <16B of AAD */
302_get_AAD_rest\num_initial_blocks\operation:
303	test	   %r11, %r11                # any AAD bytes left?
304	je	   _get_AAD_done\num_initial_blocks\operation
305
306	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
307	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
308	pxor	   \XMM2, %xmm\i
309	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
310
311_get_AAD_done\num_initial_blocks\operation:
312	xor	   %r11, %r11 # initialise the data pointer offset as zero
313	# start AES for num_initial_blocks blocks
314
315	mov	   %arg5, %rax                      # %rax = *Y0
316	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
317	PSHUFB_XMM   %xmm14, \XMM0
318
319.if (\i == 5) || (\i == 6) || (\i == 7)
320	MOVADQ		ONE(%rip),\TMP1
321	MOVADQ		(%arg1),\TMP2
322.irpc index, \i_seq
323	paddd	   \TMP1, \XMM0                 # INCR Y0
324	movdqa	   \XMM0, %xmm\index
325	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
326	pxor	   \TMP2, %xmm\index
327.endr
328	lea	0x10(%arg1),%r10
329	mov	keysize,%eax
330	shr	$2,%eax				# 128->4, 192->6, 256->8
331	add	$5,%eax			      # 128->9, 192->11, 256->13
332
333aes_loop_initial_dec\num_initial_blocks:
334	MOVADQ	(%r10),\TMP1
335.irpc	index, \i_seq
336	AESENC	\TMP1, %xmm\index
337.endr
338	add	$16,%r10
339	sub	$1,%eax
340	jnz	aes_loop_initial_dec\num_initial_blocks
341
342	MOVADQ	(%r10), \TMP1
343.irpc index, \i_seq
344	AESENCLAST \TMP1, %xmm\index         # Last Round
345.endr
346.irpc index, \i_seq
347	movdqu	   (%arg3 , %r11, 1), \TMP1
348	pxor	   \TMP1, %xmm\index
349	movdqu	   %xmm\index, (%arg2 , %r11, 1)
350	# write back plaintext/ciphertext for num_initial_blocks
351	add	   $16, %r11
352
353	movdqa     \TMP1, %xmm\index         # hash the input block (ciphertext)
354	PSHUFB_XMM	   %xmm14, %xmm\index
355                # prepare plaintext/ciphertext for GHASH computation
356.endr
357.endif
358
359        # apply GHASH on num_initial_blocks blocks
360
361.if \i == 5
362        pxor       %xmm5, %xmm6
363	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
364        pxor       %xmm6, %xmm7
365	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
366        pxor       %xmm7, %xmm8
367	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
368.elseif \i == 6
369        pxor       %xmm6, %xmm7
370	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
371        pxor       %xmm7, %xmm8
372	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
373.elseif \i == 7
374        pxor       %xmm7, %xmm8
375	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
376.endif
377	cmp	   $64, %r13
378	jl	_initial_blocks_done\num_initial_blocks\operation
379	# no need for precomputed values
380/*
381*
382* Precomputations for HashKey parallel with encryption of first 4 blocks.
383* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
384*/
385	MOVADQ	   ONE(%rip), \TMP1
386	paddd	   \TMP1, \XMM0              # INCR Y0
387	MOVADQ	   \XMM0, \XMM1
388	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
389
390	paddd	   \TMP1, \XMM0              # INCR Y0
391	MOVADQ	   \XMM0, \XMM2
392	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
393
394	paddd	   \TMP1, \XMM0              # INCR Y0
395	MOVADQ	   \XMM0, \XMM3
396	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
397
398	paddd	   \TMP1, \XMM0              # INCR Y0
399	MOVADQ	   \XMM0, \XMM4
400	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
401
402	MOVADQ	   0(%arg1),\TMP1
403	pxor	   \TMP1, \XMM1
404	pxor	   \TMP1, \XMM2
405	pxor	   \TMP1, \XMM3
406	pxor	   \TMP1, \XMM4
407	movdqa	   \TMP3, \TMP5
408	pshufd	   $78, \TMP3, \TMP1
409	pxor	   \TMP3, \TMP1
410	movdqa	   \TMP1, HashKey_k(%rsp)
411	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
412# TMP5 = HashKey^2<<1 (mod poly)
413	movdqa	   \TMP5, HashKey_2(%rsp)
414# HashKey_2 = HashKey^2<<1 (mod poly)
415	pshufd	   $78, \TMP5, \TMP1
416	pxor	   \TMP5, \TMP1
417	movdqa	   \TMP1, HashKey_2_k(%rsp)
418.irpc index, 1234 # do 4 rounds
419	movaps 0x10*\index(%arg1), \TMP1
420	AESENC	   \TMP1, \XMM1
421	AESENC	   \TMP1, \XMM2
422	AESENC	   \TMP1, \XMM3
423	AESENC	   \TMP1, \XMM4
424.endr
425	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
426# TMP5 = HashKey^3<<1 (mod poly)
427	movdqa	   \TMP5, HashKey_3(%rsp)
428	pshufd	   $78, \TMP5, \TMP1
429	pxor	   \TMP5, \TMP1
430	movdqa	   \TMP1, HashKey_3_k(%rsp)
431.irpc index, 56789 # do next 5 rounds
432	movaps 0x10*\index(%arg1), \TMP1
433	AESENC	   \TMP1, \XMM1
434	AESENC	   \TMP1, \XMM2
435	AESENC	   \TMP1, \XMM3
436	AESENC	   \TMP1, \XMM4
437.endr
438	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
439# TMP5 = HashKey^4<<1 (mod poly)
440	movdqa	   \TMP5, HashKey_4(%rsp)
441	pshufd	   $78, \TMP5, \TMP1
442	pxor	   \TMP5, \TMP1
443	movdqa	   \TMP1, HashKey_4_k(%rsp)
444	lea	   0xa0(%arg1),%r10
445	mov	   keysize,%eax
446	shr	   $2,%eax			# 128->4, 192->6, 256->8
447	sub	   $4,%eax			# 128->0, 192->2, 256->4
448	jz	   aes_loop_pre_dec_done\num_initial_blocks
449
/*
 * Extra rounds for 192/256-bit keys.  They are applied to the same
 * \XMM1..\XMM4 operands as rounds 1-9 above, not to fixed register numbers.
 */
450aes_loop_pre_dec\num_initial_blocks:
451	MOVADQ	   (%r10),\TMP2
452.irp	reg, \XMM1, \XMM2, \XMM3, \XMM4
453	AESENC	   \TMP2, \reg
454.endr
455	add	   $16,%r10
456	sub	   $1,%eax
457	jnz	   aes_loop_pre_dec\num_initial_blocks
458
459aes_loop_pre_dec_done\num_initial_blocks:
460	MOVADQ	   (%r10), \TMP2
461	AESENCLAST \TMP2, \XMM1
462	AESENCLAST \TMP2, \XMM2
463	AESENCLAST \TMP2, \XMM3
464	AESENCLAST \TMP2, \XMM4
	# XOR with the input, store the result, keep the input block for GHASH
465	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
466	pxor	   \TMP1, \XMM1
467	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
468	movdqa     \TMP1, \XMM1
469	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
470	pxor	   \TMP1, \XMM2
471	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
472	movdqa     \TMP1, \XMM2
473	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
474	pxor	   \TMP1, \XMM3
475	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
476	movdqa     \TMP1, \XMM3
477	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
478	pxor	   \TMP1, \XMM4
479	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
480	movdqa     \TMP1, \XMM4
481	add	   $64, %r11
482	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
483	pxor	   \XMMDst, \XMM1
484# combine GHASHed value with the corresponding ciphertext
485	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
486	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
487	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
488
489_initial_blocks_done\num_initial_blocks\operation:
490
491.endm
492
493
494/*
495* if a = number of total plaintext bytes
496* b = floor(a/16)
497* num_initial_blocks = b mod 4
498* encrypt the initial num_initial_blocks blocks and apply ghash on
499* the ciphertext
500* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
501* are clobbered
502* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*
* Encrypt variant: the blocks written to (%arg2) are the ciphertext, so the
* output (not the input) is what gets fed to GHASH.
*
* Parameters:
*   num_initial_blocks, operation  only used to build label names
*   i        number N of the first %xmmN holding an initial block; %xmm\i
*            also receives the hash of the AAD.  Initial blocks are only
*            processed for i = 5, 6 or 7.
*   i_seq    digits of the xmm registers holding the initial blocks,
*            walked with .irpc
*   TMP3     hash key operand handed to GHASH_MUL as HK; only read
*   XMM0     counter block: loaded from (%arg5), byte swapped, then
*            incremented once per block
*   XMM1-4   the next four counter blocks, processed only when %r13 >= 64;
*            on exit they hold the byte-swapped output blocks, with XMMDst
*            XORed into XMM1.  XMM2 doubles as the AAD hash accumulator and
*            XMM1 as GHASH_MUL scratch before that.
*   XMMDst   hash value XORed into XMM1 at the end
*   TMP1, TMP2, TMP4-TMP7  scratch
*
* Also used: arg7 = AAD pointer, arg8 = AAD length, %xmm14 = SHUF_MASK.
* When %r13 >= 64 the slots HashKey_k, HashKey_2..4 and HashKey_2_k..4_k at
* %rsp are written.  On exit %r11 is the byte offset of the next block.
503*/
504
505
506.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
507XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
508        MOVADQ     SHUF_MASK(%rip), %xmm14
509	mov	   arg7, %r10           # %r10 = AAD
510	mov	   arg8, %r11           # %r11 = aadLen
511	pxor	   %xmm\i, %xmm\i
512	pxor	   \XMM2, \XMM2
513
514	cmp	   $16, %r11
515	jl	   _get_AAD_rest\num_initial_blocks\operation
516_get_AAD_blocks\num_initial_blocks\operation:
517	movdqu	   (%r10), %xmm\i
518	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
519	pxor	   %xmm\i, \XMM2
520	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
521	add	   $16, %r10
522	sub	   $16, %r11
523	cmp	   $16, %r11
524	jge	   _get_AAD_blocks\num_initial_blocks\operation
525
526	movdqu	   \XMM2, %xmm\i             # keep the running hash in %xmm\i
527
528	/* read the last <16B of AAD */
529_get_AAD_rest\num_initial_blocks\operation:
530	test	   %r11, %r11                # any AAD bytes left?
531	je	   _get_AAD_done\num_initial_blocks\operation
532
533	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
534	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
535	pxor	   \XMM2, %xmm\i
536	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
537
538_get_AAD_done\num_initial_blocks\operation:
539	xor	   %r11, %r11 # initialise the data pointer offset as zero
540	# start AES for num_initial_blocks blocks
541
542	mov	   %arg5, %rax                      # %rax = *Y0
543	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
544	PSHUFB_XMM   %xmm14, \XMM0
545
546.if (\i == 5) || (\i == 6) || (\i == 7)
547
548	MOVADQ		ONE(%rip),\TMP1
549	MOVADQ		0(%arg1),\TMP2
550.irpc index, \i_seq
551	paddd		\TMP1, \XMM0                 # INCR Y0
552	MOVADQ		\XMM0, %xmm\index
553	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
554	pxor		\TMP2, %xmm\index
555.endr
556	lea	0x10(%arg1),%r10
557	mov	keysize,%eax
558	shr	$2,%eax				# 128->4, 192->6, 256->8
559	add	$5,%eax			      # 128->9, 192->11, 256->13
560
561aes_loop_initial_enc\num_initial_blocks:
562	MOVADQ	(%r10),\TMP1
563.irpc	index, \i_seq
564	AESENC	\TMP1, %xmm\index
565.endr
566	add	$16,%r10
567	sub	$1,%eax
568	jnz	aes_loop_initial_enc\num_initial_blocks
569
570	MOVADQ	(%r10), \TMP1
571.irpc index, \i_seq
572	AESENCLAST \TMP1, %xmm\index         # Last Round
573.endr
574.irpc index, \i_seq
575	movdqu	   (%arg3 , %r11, 1), \TMP1
576	pxor	   \TMP1, %xmm\index
577	movdqu	   %xmm\index, (%arg2 , %r11, 1)
578	# write back plaintext/ciphertext for num_initial_blocks
579	add	   $16, %r11
580	PSHUFB_XMM	   %xmm14, %xmm\index   # hash the output block (ciphertext)
581
582		# prepare plaintext/ciphertext for GHASH computation
583.endr
584.endif
585
586        # apply GHASH on num_initial_blocks blocks
587
588.if \i == 5
589        pxor       %xmm5, %xmm6
590	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
591        pxor       %xmm6, %xmm7
592	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
593        pxor       %xmm7, %xmm8
594	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
595.elseif \i == 6
596        pxor       %xmm6, %xmm7
597	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
598        pxor       %xmm7, %xmm8
599	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
600.elseif \i == 7
601        pxor       %xmm7, %xmm8
602	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
603.endif
604	cmp	   $64, %r13
605	jl	_initial_blocks_done\num_initial_blocks\operation
606	# no need for precomputed values
607/*
608*
609* Precomputations for HashKey parallel with encryption of first 4 blocks.
610* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
611*/
612	MOVADQ	   ONE(%rip),\TMP1
613	paddd	   \TMP1, \XMM0              # INCR Y0
614	MOVADQ	   \XMM0, \XMM1
615	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
616
617	paddd	   \TMP1, \XMM0              # INCR Y0
618	MOVADQ	   \XMM0, \XMM2
619	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
620
621	paddd	   \TMP1, \XMM0              # INCR Y0
622	MOVADQ	   \XMM0, \XMM3
623	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
624
625	paddd	   \TMP1, \XMM0              # INCR Y0
626	MOVADQ	   \XMM0, \XMM4
627	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
628
629	MOVADQ	   0(%arg1),\TMP1
630	pxor	   \TMP1, \XMM1
631	pxor	   \TMP1, \XMM2
632	pxor	   \TMP1, \XMM3
633	pxor	   \TMP1, \XMM4
634	movdqa	   \TMP3, \TMP5
635	pshufd	   $78, \TMP3, \TMP1
636	pxor	   \TMP3, \TMP1
637	movdqa	   \TMP1, HashKey_k(%rsp)
638	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
639# TMP5 = HashKey^2<<1 (mod poly)
640	movdqa	   \TMP5, HashKey_2(%rsp)
641# HashKey_2 = HashKey^2<<1 (mod poly)
642	pshufd	   $78, \TMP5, \TMP1
643	pxor	   \TMP5, \TMP1
644	movdqa	   \TMP1, HashKey_2_k(%rsp)
645.irpc index, 1234 # do 4 rounds
646	movaps 0x10*\index(%arg1), \TMP1
647	AESENC	   \TMP1, \XMM1
648	AESENC	   \TMP1, \XMM2
649	AESENC	   \TMP1, \XMM3
650	AESENC	   \TMP1, \XMM4
651.endr
652	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
653# TMP5 = HashKey^3<<1 (mod poly)
654	movdqa	   \TMP5, HashKey_3(%rsp)
655	pshufd	   $78, \TMP5, \TMP1
656	pxor	   \TMP5, \TMP1
657	movdqa	   \TMP1, HashKey_3_k(%rsp)
658.irpc index, 56789 # do next 5 rounds
659	movaps 0x10*\index(%arg1), \TMP1
660	AESENC	   \TMP1, \XMM1
661	AESENC	   \TMP1, \XMM2
662	AESENC	   \TMP1, \XMM3
663	AESENC	   \TMP1, \XMM4
664.endr
665	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
666# TMP5 = HashKey^4<<1 (mod poly)
667	movdqa	   \TMP5, HashKey_4(%rsp)
668	pshufd	   $78, \TMP5, \TMP1
669	pxor	   \TMP5, \TMP1
670	movdqa	   \TMP1, HashKey_4_k(%rsp)
671	lea	   0xa0(%arg1),%r10
672	mov	   keysize,%eax
673	shr	   $2,%eax			# 128->4, 192->6, 256->8
674	sub	   $4,%eax			# 128->0, 192->2, 256->4
675	jz	   aes_loop_pre_enc_done\num_initial_blocks
676
/*
 * Extra rounds for 192/256-bit keys.  They are applied to the same
 * \XMM1..\XMM4 operands as rounds 1-9 above, not to fixed register numbers.
 */
677aes_loop_pre_enc\num_initial_blocks:
678	MOVADQ	   (%r10),\TMP2
679.irp	reg, \XMM1, \XMM2, \XMM3, \XMM4
680	AESENC	   \TMP2, \reg
681.endr
682	add	   $16,%r10
683	sub	   $1,%eax
684	jnz	   aes_loop_pre_enc\num_initial_blocks
685
686aes_loop_pre_enc_done\num_initial_blocks:
687	MOVADQ	   (%r10), \TMP2
688	AESENCLAST \TMP2, \XMM1
689	AESENCLAST \TMP2, \XMM2
690	AESENCLAST \TMP2, \XMM3
691	AESENCLAST \TMP2, \XMM4
	# XOR the key stream with the input, then store the four output blocks
692	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
693	pxor	   \TMP1, \XMM1
694	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
695	pxor	   \TMP1, \XMM2
696	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
697	pxor	   \TMP1, \XMM3
698	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
699	pxor	   \TMP1, \XMM4
700	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
701	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
702	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
703	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
704
705	add	   $64, %r11
706	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
707	pxor	   \XMMDst, \XMM1
708# combine GHASHed value with the corresponding ciphertext
709	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
710	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
711	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
712
713_initial_blocks_done\num_initial_blocks\operation:
714
715.endm
716
717/*
718* encrypt 4 blocks at a time
719* ghash the 4 previously encrypted ciphertext blocks
720* arg1, %arg2, %arg3 are used as pointers only, not modified
721* %r11 is the data offset value
*
* Parameters:
*   XMM0       counter block, incremented four times
*   XMM1-XMM4  in:  the four previous byte-swapped ciphertext blocks
*              out: the four new byte-swapped ciphertext blocks, with the
*                   updated hash value XORed into XMM1
*   XMM5-XMM8  working copies of the incoming XMM1-XMM4, multiplied by the
*              values in the HashKey_4, HashKey_3, HashKey_2 and HashKey
*              slots respectively; the reduced hash ends up in XMM5
*   TMP1-TMP6  scratch
*   operation  not referenced in the macro body
*
* Reads the HashKey* slots at %rsp and the round keys at (%arg1).
* Clobbers %r10, %eax, %xmm15 (SHUF_MASK) and the flags.  %r11 is only
* read here; it is not advanced.
* Internal labels carry the \@ suffix so that every expansion gets its own.
722*/
723.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
724TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
725
726	movdqa	  \XMM1, \XMM5
727	movdqa	  \XMM2, \XMM6
728	movdqa	  \XMM3, \XMM7
729	movdqa	  \XMM4, \XMM8
730
731        movdqa    SHUF_MASK(%rip), %xmm15
732        # multiply XMM5 * HashKey^4 using karatsuba
733
734	movdqa	  \XMM5, \TMP4
735	pshufd	  $78, \XMM5, \TMP6
736	pxor	  \XMM5, \TMP6
737	paddd     ONE(%rip), \XMM0		# INCR CNT
738	movdqa	  HashKey_4(%rsp), \TMP5
739	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
740	movdqa    \XMM0, \XMM1
741	paddd     ONE(%rip), \XMM0		# INCR CNT
742	movdqa    \XMM0, \XMM2
743	paddd     ONE(%rip), \XMM0		# INCR CNT
744	movdqa    \XMM0, \XMM3
745	paddd     ONE(%rip), \XMM0		# INCR CNT
746	movdqa    \XMM0, \XMM4
747	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
748	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
749	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
750	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
751	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
752
753	pxor	  (%arg1), \XMM1
754	pxor	  (%arg1), \XMM2
755	pxor	  (%arg1), \XMM3
756	pxor	  (%arg1), \XMM4
757	movdqa	  HashKey_4_k(%rsp), \TMP5
758	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
759	movaps 0x10(%arg1), \TMP1
760	AESENC	  \TMP1, \XMM1              # Round 1
761	AESENC	  \TMP1, \XMM2
762	AESENC	  \TMP1, \XMM3
763	AESENC	  \TMP1, \XMM4
764	movaps 0x20(%arg1), \TMP1
765	AESENC	  \TMP1, \XMM1              # Round 2
766	AESENC	  \TMP1, \XMM2
767	AESENC	  \TMP1, \XMM3
768	AESENC	  \TMP1, \XMM4
	# multiply XMM6 * HashKey^3 using karatsuba
769	movdqa	  \XMM6, \TMP1
770	pshufd	  $78, \XMM6, \TMP2
771	pxor	  \XMM6, \TMP2
772	movdqa	  HashKey_3(%rsp), \TMP5
773	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
774	movaps 0x30(%arg1), \TMP3
775	AESENC    \TMP3, \XMM1              # Round 3
776	AESENC    \TMP3, \XMM2
777	AESENC    \TMP3, \XMM3
778	AESENC    \TMP3, \XMM4
779	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
780	movaps 0x40(%arg1), \TMP3
781	AESENC	  \TMP3, \XMM1              # Round 4
782	AESENC	  \TMP3, \XMM2
783	AESENC	  \TMP3, \XMM3
784	AESENC	  \TMP3, \XMM4
785	movdqa	  HashKey_3_k(%rsp), \TMP5
786	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
787	movaps 0x50(%arg1), \TMP3
788	AESENC	  \TMP3, \XMM1              # Round 5
789	AESENC	  \TMP3, \XMM2
790	AESENC	  \TMP3, \XMM3
791	AESENC	  \TMP3, \XMM4
792	pxor	  \TMP1, \TMP4
793# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
794	pxor	  \XMM6, \XMM5
795	pxor	  \TMP2, \TMP6
796	movdqa	  \XMM7, \TMP1
797	pshufd	  $78, \XMM7, \TMP2
798	pxor	  \XMM7, \TMP2
799	movdqa	  HashKey_2(%rsp ), \TMP5
800
801        # Multiply XMM7 * HashKey^2 using karatsuba
802
803	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
804	movaps 0x60(%arg1), \TMP3
805	AESENC	  \TMP3, \XMM1              # Round 6
806	AESENC	  \TMP3, \XMM2
807	AESENC	  \TMP3, \XMM3
808	AESENC	  \TMP3, \XMM4
809	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
810	movaps 0x70(%arg1), \TMP3
811	AESENC	  \TMP3, \XMM1             # Round 7
812	AESENC	  \TMP3, \XMM2
813	AESENC	  \TMP3, \XMM3
814	AESENC	  \TMP3, \XMM4
815	movdqa	  HashKey_2_k(%rsp), \TMP5
816	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
817	movaps 0x80(%arg1), \TMP3
818	AESENC	  \TMP3, \XMM1             # Round 8
819	AESENC	  \TMP3, \XMM2
820	AESENC	  \TMP3, \XMM3
821	AESENC	  \TMP3, \XMM4
822	pxor	  \TMP1, \TMP4
823# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
824	pxor	  \XMM7, \XMM5
825	pxor	  \TMP2, \TMP6
826
827        # Multiply XMM8 * HashKey
828        # XMM8 and TMP5 hold the values for the two operands
829
830	movdqa	  \XMM8, \TMP1
831	pshufd	  $78, \XMM8, \TMP2
832	pxor	  \XMM8, \TMP2
833	movdqa	  HashKey(%rsp), \TMP5
834	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
835	movaps 0x90(%arg1), \TMP3
836	AESENC	  \TMP3, \XMM1            # Round 9
837	AESENC	  \TMP3, \XMM2
838	AESENC	  \TMP3, \XMM3
839	AESENC	  \TMP3, \XMM4
840	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
841	lea	  0xa0(%arg1),%r10
842	mov	  keysize,%eax
843	shr	  $2,%eax			# 128->4, 192->6, 256->8
844	sub	  $4,%eax			# 128->0, 192->2, 256->4
845	jz	  aes_loop_par_enc_done\@
846
/*
 * Extra rounds for 192/256-bit keys.  They are applied to the same
 * \XMM1..\XMM4 operands as rounds 1-9 above, not to fixed register numbers.
 */
847aes_loop_par_enc\@:
848	MOVADQ	  (%r10),\TMP3
849.irp	reg, \XMM1, \XMM2, \XMM3, \XMM4
850	AESENC	  \TMP3, \reg
851.endr
852	add	  $16,%r10
853	sub	  $1,%eax
854	jnz	  aes_loop_par_enc\@
855
856aes_loop_par_enc_done\@:
857	MOVADQ	  (%r10), \TMP3
858	AESENCLAST \TMP3, \XMM1           # Last round
859	AESENCLAST \TMP3, \XMM2
860	AESENCLAST \TMP3, \XMM3
861	AESENCLAST \TMP3, \XMM4
862	movdqa    HashKey_k(%rsp), \TMP5
863	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
864	movdqu	  (%arg3,%r11,1), \TMP3
865	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
866	movdqu	  16(%arg3,%r11,1), \TMP3
867	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
868	movdqu	  32(%arg3,%r11,1), \TMP3
869	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
870	movdqu	  48(%arg3,%r11,1), \TMP3
871	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
872        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
873        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
874        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
875        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
876	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
877	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
878	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
879	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
880
	# combine the four partial products: high part in TMP1, low part in
	# XMM5, middle part in TMP2
881	pxor	  \TMP4, \TMP1
882	pxor	  \XMM8, \XMM5
883	pxor	  \TMP6, \TMP2
884	pxor	  \TMP1, \TMP2
885	pxor	  \XMM5, \TMP2
886	movdqa	  \TMP2, \TMP3
887	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
888	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
889	pxor	  \TMP3, \XMM5
890	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
891
892        # first phase of reduction
893
894	movdqa    \XMM5, \TMP2
895	movdqa    \XMM5, \TMP3
896	movdqa    \XMM5, \TMP4
897# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
898	pslld     $31, \TMP2                   # packed left shift << 31
899	pslld     $30, \TMP3                   # packed left shift << 30
900	pslld     $25, \TMP4                   # packed left shift << 25
901	pxor      \TMP3, \TMP2	               # xor the shifted versions
902	pxor      \TMP4, \TMP2
903	movdqa    \TMP2, \TMP5
904	psrldq    $4, \TMP5                    # right shift T5 1 DW
905	pslldq    $12, \TMP2                   # left shift T2 3 DWs
906	pxor      \TMP2, \XMM5
907
908        # second phase of reduction
909
910	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
911	movdqa    \XMM5,\TMP3
912	movdqa    \XMM5,\TMP4
913	psrld     $1, \TMP2                    # packed right shift >>1
914	psrld     $2, \TMP3                    # packed right shift >>2
915	psrld     $7, \TMP4                    # packed right shift >>7
916	pxor      \TMP3,\TMP2		       # xor the shifted versions
917	pxor      \TMP4,\TMP2
918	pxor      \TMP5, \TMP2
919	pxor      \TMP2, \XMM5
920	pxor      \TMP1, \XMM5                 # result is in XMM5
921
922	pxor	  \XMM5, \XMM1                 # fold the hash into the first new block
923.endm
924
/*
* GHASH_4_ENCRYPT_4_PARALLEL_DEC
*
* Decrypt 4 blocks at a time and, interleaved with the AES rounds, GHASH
* the 4 ciphertext blocks that were read by the previous step.
*
* %arg1, %arg2, %arg3 are used as pointers only, not modified
*   %arg1 - expanded AES key schedule (round keys at 0x00, 0x10, ...)
*   %arg2 - output buffer, 64 bytes are written at offset %r11
*   %arg3 - input buffer, 64 bytes are read at offset %r11
* %r11 is the data offset value (not modified here)
*
* Arguments:
*   TMP1-TMP6  scratch xmm registers
*   XMM0       counter block; incremented four times with paddd ONE
*   XMM1-XMM4  in:  the four previous ciphertext blocks, byte swapped
*              out: the four ciphertext blocks just read, byte swapped,
*                   with the reduced GHASH value XORed into XMM1
*   XMM5-XMM8  scratch, receive copies of XMM1-XMM4 on entry
*   operation  not referenced by the macro body
*
* Reads the HashKey, HashKey_2, HashKey_3, HashKey_4 stack slots and their
* _k companions relative to %rsp (presumably filled in by the
* INITIAL_BLOCKS macro - verify against its definition).
*
* Clobbers: %eax, %r10, %xmm15 and the flags.
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

        movdqa    SHUF_MASK(%rip), %xmm15
        # multiply XMM5 by the HashKey_4 slot using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6            # swap the two 64 bit halves
	pxor	  \XMM5, \TMP6                 # TMP6 = a1+a0 in both halves
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
	movdqa    \XMM0, \XMM1
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM2
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM3
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1               # Round 0: XOR with round key 0
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4

        # multiply XMM6 by the HashKey_3 slot using karatsuba

	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC    \TMP3, \XMM1              # Round 3
	AESENC    \TMP3, \XMM2
	AESENC    \TMP3, \XMM3
	AESENC    \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp ), \TMP5

        # Multiply XMM7 by the HashKey_2 slot using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1            # Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10		# %r10 -> round key 10
	mov	  keysize,%eax
	shr	  $2,%eax		        # 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done\@	# AES-128: no extra rounds

	# extra AESENC rounds for 192 and 256 bit keys, %eax counts them down
aes_loop_par_dec\@:
	MOVADQ	  (%r10),\TMP3
	AESENC	  \TMP3, \XMM1
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec\@

aes_loop_par_dec_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1           # last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa    HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
	movdqa    \TMP3, \XMM1                 # keep the ciphertext for GHASH
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM2
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM3
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
	movdqa    \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	# combine the four partial products (karatsuba middle term fix-up)
	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

        # first phase of reduction

	movdqa    \XMM5, \TMP2
	movdqa    \XMM5, \TMP3
	movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld     $31, \TMP2                   # packed left shift << 31
	pslld     $30, \TMP3                   # packed left shift << 30
	pslld     $25, \TMP4                   # packed left shift << 25
	pxor      \TMP3, \TMP2	               # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5                    # right shift T5 1 DW
	pslldq    $12, \TMP2                   # left shift T2 3 DWs
	pxor      \TMP2, \XMM5

        # second phase of reduction

	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa    \XMM5,\TMP3
	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # packed right shift >>1
	psrld     $2, \TMP3                    # packed right shift >>2
	psrld     $7, \TMP4                    # packed right shift >>7
	pxor      \TMP3,\TMP2		       # xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # reduced result is in XMM5

	pxor	  \XMM5, \XMM1                 # fold the hash into the first new block
.endm
1136
/*
* GHASH the last 4 ciphertext blocks.
*
* Arguments:
*   TMP1-TMP7  scratch xmm registers
*   XMM1-XMM4  the four blocks to hash; all four are overwritten
*   XMMDst     receives the reduced 128 bit result
*
* XMM1..XMM4 are multiplied by the HashKey_4, HashKey_3, HashKey_2 and
* HashKey stack slots respectively (each with its _k companion for the
* karatsuba middle term), the products are XORed together and the sum is
* reduced once.  No general purpose register is touched.
*/
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

        # Multiply XMM1 by the HashKey_4 slot (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2        # swap the two 64 bit halves
	pxor	  \XMM1, \TMP2             # TMP2 = a1+a0 in both halves
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1

        # Multiply XMM2 by the HashKey_3 slot (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

        # Multiply XMM3 by the HashKey_2 slot (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1

        # Multiply XMM4 by the HashKey slot (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6              # TMP6   = sum of all a1*b1
	pxor	  \XMM4, \XMMDst            # XMMDst = sum of all a0*b0
	pxor	  \XMM1, \TMP2              # TMP2   = sum of all middle products
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa    \XMMDst, \TMP2
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld     $31, \TMP2                # packed left shift << 31
	pslld     $30, \TMP3                # packed left shift << 30
	pslld     $25, \TMP4                # packed left shift << 25
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP7
	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
	pxor      \TMP2, \XMMDst

        # second phase of the reduction
	movdqa    \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
	psrld     $1, \TMP2                 # packed right shift >> 1
	psrld     $2, \TMP3                 # packed right shift >> 2
	psrld     $7, \TMP4                 # packed right shift >> 7
	pxor      \TMP3, \TMP2              # xor the shifted versions
	pxor      \TMP4, \TMP2
	pxor      \TMP7, \TMP2
	pxor      \TMP2, \XMMDst
	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
.endm
1233
1234
/*
* ENCRYPT_SINGLE_BLOCK XMM0 TMP1
*
* AES-encrypt the single block held in XMM0, in place, with the expanded
* key schedule that %arg1 points to.
*
*   XMM0  in: block to encrypt, out: encrypted block
*   TMP1  scratch xmm register, receives each round key in turn
*
* The number of AESENC rounds is derived from keysize (9, 11 or 13), and
* one AESENCLAST finishes the block.
* Clobbers %eax, %r10 and the flags.
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	lea		16(%arg1), %r10		# %r10 -> round key 1
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	pxor		(%arg1), \XMM0		# round 0: XOR with round key 0

_esb_round_\@:
	MOVADQ		(%r10),\TMP1		# fetch the next round key
	add		$16,%r10
	AESENC		\TMP1,\XMM0
	sub		$1,%eax			# one round less to go
	jnz		_esb_round_\@

	MOVADQ		(%r10),\TMP1		# final round key
	AESENCLAST	\TMP1,\XMM0
.endm
1257/*****************************************************************************
1258* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1259*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1260*                   const u8 *in,      // Ciphertext input
1261*                   u64 plaintext_len, // Length of data in bytes for decryption.
1262*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1263*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1264*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1265*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1266*                   const u8 *aad,     // Additional Authentication Data (AAD)
1267*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1268*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1269*                                      // given authentication tag and only return the plaintext if they match.
1270*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1271*                                      // (most likely), 12 or 8.
1272*
1273* Assumptions:
1274*
1275* keys:
*       keys are pre-expanded and aligned to 16 bytes. The round keys are
*       read from the start of void *aes_ctx; how many are used depends on
*       the key length (11 for 128-bit, 13 for 192-bit, 15 for 256-bit keys)
1278*
1279* iv:
1280*       0                   1                   2                   3
1281*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1282*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1283*       |                             Salt  (From the SA)               |
1284*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1285*       |                     Initialization Vector                     |
1286*       |         (This is the sequence number from IPSec header)       |
1287*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1288*       |                              0x1                              |
1289*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1290*
1291*
1292*
1293* AAD:
1294*       AAD padded to 128 bits with 0
1295*       for example, assume AAD is a u32 vector
1296*
1297*       if AAD is 8 bytes:
1298*       AAD[3] = {A0, A1};
1299*       padded AAD in xmm register = {A1 A0 0 0}
1300*
1301*       0                   1                   2                   3
1302*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1303*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1304*       |                               SPI (A1)                        |
1305*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1306*       |                     32-bit Sequence Number (A0)               |
1307*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1308*       |                              0x0                              |
1309*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1310*
1311*                                       AAD Format with 32-bit Sequence Number
1312*
1313*       if AAD is 12 bytes:
1314*       AAD[3] = {A0, A1, A2};
1315*       padded AAD in xmm register = {A2 A1 A0 0}
1316*
1317*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1322*       |                               SPI (A2)                        |
1323*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1324*       |                 64-bit Extended Sequence Number {A1,A0}       |
1325*       |                                                               |
1326*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1327*       |                              0x0                              |
1328*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1329*
1330*                        AAD Format with 64-bit Extended Sequence Number
1331*
1332* poly = x^128 + x^127 + x^126 + x^121 + 1
1333*
1334*****************************************************************************/
ENTRY(aesni_gcm_dec)
/*
* Register usage inside this function:
*   %r14   copy of %rsp taken after the three pushes; restores the stack
*          pointer on exit
*   %r13   bytes of whole 16-byte blocks still to process, later the
*          length of the trailing partial block (arg4 mod 16)
*   %r12   scratch
*   %r11   byte offset into the in/out buffers (presumably zeroed by
*          INITIAL_BLOCKS_DEC - verify against that macro), reused for
*          auth_tag_len at the end
*   %xmm0  counter block
*   %xmm8  running GHASH value
*   %xmm13 HashKey<<1 (mod poly)
*/
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
	sub	$VARIABLE_OFFSET, %rsp            # room for the HashKey slots
	and	$~63, %rsp                        # align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
        movdqa  SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13		  # byte swap the hash key


# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13			  # shift each 64 bit half left by 1
	psrlq	$63, %xmm2			  # top bit of each half
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2			  # carry from low half to high half
	psrldq	$8, %xmm1			  # bit shifted out of the high half
	por	%xmm2, %xmm13

        # Reduction

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2		  # build a mask from the carried out bit
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)


        # Decrypt first few blocks

	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
	and $(3<<4), %r12                   # %r12 = 16 * (number of blocks mod 4)
	jz _initial_num_blocks_is_0_decrypt
	cmp $(2<<4), %r12
	jb _initial_num_blocks_is_1_decrypt
	je _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub	$48, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub	$32, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub	$16, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	# %r13 is now a multiple of 64: bytes left for the 4 block main loop
	cmp	$0, %r13
	je	_zero_cipher_left_decrypt
	sub	$64, %r13
	je	_four_cipher_left_decrypt
_decrypt_by_4:
	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add	$64, %r11
	sub	$64, %r13
	jne	_decrypt_by_4
_four_cipher_left_decrypt:
	# hash the four blocks still held in %xmm1-%xmm4, result in %xmm8
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov	%arg4, %r13
	and	$15, %r13				# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

        # Handle the last <16 byte block separately

	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)

	lea (%arg3,%r11,1), %r10               # %r10 -> start of the partial block
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

	lea ALL_F+16(%rip), %r12
	sub %r13, %r12
	movdqa  %xmm1, %xmm2         # keep a copy of the ciphertext for GHASH
	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
	movdqu (%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
	pand    %xmm1, %xmm2
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor %xmm2, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

        # output %r13 bytes
	MOVQ_R64_XMM	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)        # store the low 8 bytes at once
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_decrypt:
	mov	%al,  (%arg2, %r11, 1)         # store the rest byte by byte
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
	shl	$3, %r12		  # convert into number of bits
	movd	%r12d, %xmm15		  # len(A) in %xmm15 (low 32 bits only)
	shl	$3, %arg4		  # len(C) in bits (*8)
	MOVQ_R64_XMM	%arg4, %xmm1
	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	         # final GHASH computation
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	mov	%arg5, %rax		  # %rax = *Y0
	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0		  # %xmm0 = tag = GHASH XOR E(K, Y0)
_return_T_decrypt:
	# write the first auth_tag_len bytes of %xmm0 to authTag
	mov	arg9, %r10                # %r10 = authTag
	mov	arg10, %r11               # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_decrypt
	cmp	$8, %r11
	jl	_T_4_decrypt
_T_8_decrypt:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)              # store 8 tag bytes
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_decrypt
_T_4_decrypt:
	movd	%xmm0, %eax
	mov	%eax, (%r10)              # store 4 tag bytes
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_decrypt
_T_123_decrypt:
	# 1, 2 or 3 tag bytes remain
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_decrypt
	mov	%ax, (%r10)               # store 2 tag bytes
	cmp	$2, %r11
	je	_return_T_done_decrypt
	add	$2, %r10
	sar	$16, %eax                 # bring the third byte into %al
_T_1_decrypt:
	mov	%al, (%r10)
	jmp	_return_T_done_decrypt
_T_16_decrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
	mov	%r14, %rsp                # drop the aligned scratch area
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_dec)
1521
1522
1523/*****************************************************************************
1524* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1525*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1526*                    const u8 *in,       // Plaintext input
1527*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1528*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1529*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1530*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1531*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1532*                    const u8 *aad,      // Additional Authentication Data (AAD)
1533*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1534*                    u8 *auth_tag,       // Authenticated Tag output.
1535*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1536*                                        // 12 or 8.
1537*
1538* Assumptions:
1539*
1540* keys:
*       keys are pre-expanded and aligned to 16 bytes. The round keys are
*       read from the start of void *aes_ctx; how many are used depends on
*       the key length (11 for 128-bit, 13 for 192-bit, 15 for 256-bit keys)
1543*
1544*
1545* iv:
1546*       0                   1                   2                   3
1547*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1548*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549*       |                             Salt  (From the SA)               |
1550*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1551*       |                     Initialization Vector                     |
1552*       |         (This is the sequence number from IPSec header)       |
1553*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1554*       |                              0x1                              |
1555*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1556*
1557*
1558*
1559* AAD:
1560*       AAD padded to 128 bits with 0
1561*       for example, assume AAD is a u32 vector
1562*
1563*       if AAD is 8 bytes:
1564*       AAD[3] = {A0, A1};
1565*       padded AAD in xmm register = {A1 A0 0 0}
1566*
1567*       0                   1                   2                   3
1568*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1569*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570*       |                               SPI (A1)                        |
1571*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1572*       |                     32-bit Sequence Number (A0)               |
1573*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1574*       |                              0x0                              |
1575*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1576*
1577*                                 AAD Format with 32-bit Sequence Number
1578*
1579*       if AAD is 12 bytes:
1580*       AAD[3] = {A0, A1, A2};
1581*       padded AAD in xmm register = {A2 A1 A0 0}
1582*
1583*       0                   1                   2                   3
1584*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1585*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586*       |                               SPI (A2)                        |
1587*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588*       |                 64-bit Extended Sequence Number {A1,A0}       |
1589*       |                                                               |
1590*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1591*       |                              0x0                              |
1592*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1593*
1594*                         AAD Format with 64-bit Extended Sequence Number
1595*
1596* poly = x^128 + x^127 + x^126 + x^121 + 1
1597***************************************************************************/
ENTRY(aesni_gcm_enc)
/*
* Register usage inside this function:
*   %r14   copy of %rsp taken after the three pushes; restores the stack
*          pointer on exit
*   %r13   bytes of whole 16-byte blocks still to process, later the
*          length of the trailing partial block (arg4 mod 16)
*   %r12   scratch
*   %r11   byte offset into the in/out buffers (presumably zeroed by
*          INITIAL_BLOCKS_ENC - verify against that macro), reused for
*          auth_tag_len at the end
*   %xmm0  counter block
*   %xmm8  running GHASH value
*   %xmm13 HashKey<<1 (mod poly)
*/
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
	sub	$VARIABLE_OFFSET, %rsp         # room for the HashKey slots
	and	$~63, %rsp                     # align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13                 # %xmm13 = HashKey
        movdqa  SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13               # byte swap the hash key


# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13                     # shift each 64 bit half left by 1
	psrlq	$63, %xmm2                     # top bit of each half
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2                      # carry from low half to high half
	psrldq	$8, %xmm1                      # bit shifted out of the high half
	por	%xmm2, %xmm13

        # reduce HashKey<<1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2            # build a mask from the carried out bit
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13          # %xmm13 holds HashKey<<1 (mod poly)
	movdqa	%xmm13, HashKey(%rsp)  # store HashKey<<1 (mod poly)
	mov	%arg4, %r13            # %r13 = number of bytes of plaintext
	and	$-16, %r13             # %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12

        # Encrypt first few blocks

	and	$(3<<4), %r12          # %r12 = 16 * (number of blocks mod 4)
	jz	_initial_num_blocks_is_0_encrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	sub	$48, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	sub	$32, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	sub	$16, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

        # Main loop - Encrypt remaining blocks
        # %r13 is now a multiple of 64: bytes left for the 4 block loop

	cmp	$0, %r13
	je	_zero_cipher_left_encrypt
	sub	$64, %r13
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	# hash the four blocks still held in %xmm1-%xmm4, result in %xmm8
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	mov	%arg4, %r13
	and	$15, %r13			# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt

         # Handle the last <16 Byte block separately
	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)

	lea (%arg3,%r11,1), %r10        # %r10 -> start of the partial block
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

	lea ALL_F+16(%rip), %r12
	sub %r13, %r12
	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor	%xmm0, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	# shuffle xmm0 back to output as ciphertext

        # Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_encrypt
	mov %rax, (%arg2 , %r11, 1)     # store the low 8 bytes at once
	add $8, %r11
	psrldq $8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_encrypt:
	mov %al,  (%arg2, %r11, 1)      # store the rest byte by byte
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12    # %r12 = aadLen (number of bytes)
	shl	$3, %r12            # convert into number of bits
	movd	%r12d, %xmm15       # len(A) in %xmm15 (low 32 bits only)
	shl	$3, %arg4               # len(C) in bits (*8)
	MOVQ_R64_XMM	%arg4, %xmm1
	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
        movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap

	mov	%arg5, %rax		       # %rax  = *Y0
	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
	pxor	%xmm8, %xmm0                   # %xmm0 = tag = GHASH XOR E(K, Y0)
_return_T_encrypt:
	# write the first auth_tag_len bytes of %xmm0 to authTag
	mov	arg9, %r10                     # %r10 = authTag
	mov	arg10, %r11                    # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_encrypt
	cmp	$8, %r11
	jl	_T_4_encrypt
_T_8_encrypt:
	MOVQ_R64_XMM	%xmm0, %rax
	mov	%rax, (%r10)                   # store 8 tag bytes
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	cmp	$0, %r11
	je	_return_T_done_encrypt
_T_4_encrypt:
	movd	%xmm0, %eax
	mov	%eax, (%r10)                   # store 4 tag bytes
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	cmp	$0, %r11
	je	_return_T_done_encrypt
_T_123_encrypt:
	# 1, 2 or 3 tag bytes remain
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_encrypt
	mov	%ax, (%r10)                    # store 2 tag bytes
	cmp	$2, %r11
	je	_return_T_done_encrypt
	add	$2, %r10
	sar	$16, %eax                      # bring the third byte into %al
_T_1_encrypt:
	mov	%al, (%r10)
	jmp	_return_T_done_encrypt
_T_16_encrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_encrypt:
	mov	%r14, %rsp                     # drop the aligned scratch area
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_enc)
1788
1789#endif
1790
1791
/*
 * _key_expansion_128 / _key_expansion_256a:	internal helper
 * input:
 *	%xmm0:	previous round key
 *	%xmm1:	AESKEYGENASSIST result (its top dword is broadcast here)
 *	%xmm4:	scratch; aesni_set_key zeroes it before the first call
 *	TKEYP:	slot for the new round key
 * output:
 *	%xmm0:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
.align 4
_key_expansion_128:
_key_expansion_256a:
	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pshufd $0xff, %xmm1, %xmm1	# broadcast dword 3 of the assist value
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)		# emit the round key
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
1806
/*
 * _key_expansion_192a:	internal helper
 * input:
 *	%xmm0, %xmm2:	current 192-bit key material
 *	%xmm1:		AESKEYGENASSIST result (dword 1 is broadcast here)
 *	%xmm4:		scratch; aesni_set_key zeroes it before the first call
 *	TKEYP:		slot for the next round keys
 * output:
 *	%xmm0, %xmm2:	updated key material
 *	two 16-byte round keys stored at (TKEYP) and 0x10(TKEYP)
 *	TKEYP:		advanced by 32 bytes
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
.align 4
_key_expansion_192a:
	movaps %xmm2, %xmm6		# keep the old %xmm2 for the first store
	movaps %xmm2, %xmm5
	pslldq $4, %xmm5

	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pshufd $0x55, %xmm1, %xmm1	# broadcast dword 1 of the assist value
	pxor %xmm1, %xmm0

	pshufd $0xff, %xmm0, %xmm3	# broadcast dword 3 of the new %xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	shufps $0x44, %xmm0, %xmm6	# low qword of old %xmm2, low qword of %xmm0
	movaps %xmm6, (TKEYP)
	movaps %xmm0, %xmm1
	shufps $0x4e, %xmm2, %xmm1	# high qword of %xmm0, low qword of %xmm2
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)
1831
/*
 * _key_expansion_192b:	internal helper
 * input:
 *	%xmm0, %xmm2:	current 192-bit key material
 *	%xmm1:		AESKEYGENASSIST result (dword 1 is broadcast here)
 *	%xmm4:		scratch; aesni_set_key zeroes it before the first call
 *	TKEYP:		slot for the next round key
 * output:
 *	%xmm0, %xmm2:	updated key material
 *	one 16-byte round key (%xmm0) stored at (TKEYP)
 *	TKEYP:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
.align 4
_key_expansion_192b:
	movaps %xmm2, %xmm5
	pslldq $4, %xmm5

	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pshufd $0x55, %xmm1, %xmm1	# broadcast dword 1 of the assist value
	pxor %xmm1, %xmm0

	pshufd $0xff, %xmm0, %xmm3	# broadcast dword 3 of the new %xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_192b)
1851
/*
 * _key_expansion_256b:	internal helper
 * input:
 *	%xmm2:	round key from two rounds earlier
 *	%xmm1:	AESKEYGENASSIST result (dword 2 is broadcast here)
 *	%xmm4:	scratch; aesni_set_key zeroes it before the first call
 *	TKEYP:	slot for the new round key
 * output:
 *	%xmm2:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
.align 4
_key_expansion_256b:
	shufps $0x10, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0x8c, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pshufd $0xaa, %xmm1, %xmm1	# broadcast dword 2 of the assist value
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)		# emit the round key
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_256b)
1864
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Expands in_key into the encryption round keys starting at ctx+0, stores
 * key_len at ctx+480, and then fills the decryption round keys starting at
 * ctx+240: the first and last encryption keys are swapped across, the ones
 * in between are run through AESIMC and written in reverse order.
 * AREG is zeroed before returning.
 */
ENTRY(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
#endif
	pxor %xmm4, %xmm4		# the _key_expansion_* helpers expect xmm4 == 0
	movups (UKEYP), %xmm0		# first 16 bytes of the user key
	lea 0x10(KEYP), TKEYP		# TKEYP = slot of the next round key
	movaps %xmm0, (KEYP)		# round key 0 is the user key itself
	movl %edx, 480(KEYP)		# record key_len in the context
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192

	/* 256-bit key: the second half of the user key is round key 1 */
	movups 0x10(UKEYP), %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	/* rounds 1-6 each produce two round keys */
.irp rc, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20
	AESKEYGENASSIST \rc %xmm2 %xmm1
	call _key_expansion_256a
	AESKEYGENASSIST \rc %xmm0 %xmm1
	call _key_expansion_256b
.endr
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7 produces only one key
	call _key_expansion_256a
	jmp .Ldec_key

.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# remaining 8 bytes of the user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key

.Lenc_key128:
	/* rounds 1-10, one round key each */
.irp rc, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
	AESKEYGENASSIST \rc %xmm0 %xmm1
	call _key_expansion_128
.endr

.Ldec_key:
	sub $0x10, TKEYP		# TKEYP = last encryption round key
	lea 240-16(TKEYP), UKEYP	# UKEYP = decryption slot filled first by the loop
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)	# first enc key becomes the last dec key
	movaps %xmm1, 240(KEYP)		# last enc key becomes the first dec key
	add $0x10, KEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP			# walk the encryption keys forwards
	sub $0x10, UKEYP		# ... and the decryption slots backwards
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG			# return value 0
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_set_key)
1979
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts the single 16-byte block at src and stores the result at dst.
 */
ENTRY(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), INP	# src
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
#endif
	movups (INP), STATE		# fetch the input block
	movl 480(KEYP), KLEN		# key length kept at ctx+480
	call _aesni_enc1
	movups STATE, (OUTP)		# write the encrypted block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_enc)
2003
/*
 * _aesni_enc1:		internal ABI
 *	Encrypt one block with the round keys at KEYP.
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length (compared against 24 to pick the
 *			128/192/256 entry point below)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc1:
	movaps (KEYP), KEY		# round key 0
	pxor KEY, STATE			# round 0
	lea 0x30(KEYP), TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags for the je below
	je .Lenc192
	add $0x20, TKEYP
	/* two extra rounds for the longest key */
.irp off, -0x60, -0x50
	movaps \off(TKEYP), KEY
	AESENC KEY STATE
.endr
.align 4
.Lenc192:
	/* two extra rounds shared by the 24-byte and longer keys */
.irp off, -0x40, -0x30
	movaps \off(TKEYP), KEY
	AESENC KEY STATE
.endr
.align 4
.Lenc128:
	/* nine rounds common to every key length */
.irp off, -0x20, -0x10, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	AESENC KEY STATE
.endr
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE		# last round
	ret
ENDPROC(_aesni_enc1)
2061
/*
 * _aesni_enc4:	internal ABI
 *	Encrypt four blocks in parallel with the round keys at KEYP.
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length (compared against 24 to pick the
 *			128/192/256 entry point below)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags for the je below
	je .L4enc192
	add $0x20, TKEYP
	/* two extra rounds for the longest key */
.irp off, -0x60, -0x50
	movaps \off(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
.endr
#.align 4
.L4enc192:
	/* two extra rounds shared by the 24-byte and longer keys */
.irp off, -0x40, -0x30
	movaps \off(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
.endr
#.align 4
.L4enc128:
	/* nine rounds common to every key length */
.irp off, -0x20, -0x10, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
.endr
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)
2170
/*
 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypts the single 16-byte block at src and stores the result at dst,
 * using the round keys that start 240 bytes into ctx.
 */
ENTRY(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), INP	# src
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
#endif
	movups (INP), STATE		# fetch the input block
	mov 480(KEYP), KLEN		# key length kept at ctx+480
	add $240, KEYP			# switch to the decryption round keys
	call _aesni_dec1
	movups STATE, (OUTP)		# write the decrypted block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_dec)
2195
/*
 * _aesni_dec1:		internal ABI
 *	Decrypt one block with the round keys at KEYP.
 * input:
 *	KEYP:		key struct pointer (callers in this file pass
 *			the context address plus 240)
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# round key 0
	pxor KEY, STATE			# round 0
	lea 0x30(KEYP), TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags for the je below
	je .Ldec192
	add $0x20, TKEYP
	/* two extra rounds for the longest key */
.irp off, -0x60, -0x50
	movaps \off(TKEYP), KEY
	AESDEC KEY STATE
.endr
.align 4
.Ldec192:
	/* two extra rounds shared by the 24-byte and longer keys */
.irp off, -0x40, -0x30
	movaps \off(TKEYP), KEY
	AESDEC KEY STATE
.endr
.align 4
.Ldec128:
	/* nine rounds common to every key length */
.irp off, -0x20, -0x10, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	AESDEC KEY STATE
.endr
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE		# last round
	ret
ENDPROC(_aesni_dec1)
2253
/*
 * _aesni_dec4:	internal ABI
 *	Decrypt four blocks in parallel with the round keys at KEYP.
 * input:
 *	KEYP:		key struct pointer (callers in this file pass
 *			the context address plus 240)
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec4:
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags for the je below
	je .L4dec192
	add $0x20, TKEYP
	/* two extra rounds for the longest key */
.irp off, -0x60, -0x50
	movaps \off(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.endr
.align 4
.L4dec192:
	/* two extra rounds shared by the 24-byte and longer keys */
.irp off, -0x40, -0x30
	movaps \off(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.endr
.align 4
.L4dec128:
	/* nine rounds common to every key length */
.irp off, -0x20, -0x10, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.endr
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
ENDPROC(_aesni_dec4)
2362
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypts the whole 16-byte blocks of src into dst: 64 bytes (four
 * blocks) per iteration while at least 64 bytes remain, then one block at
 * a time.  A trailing partial block (len % 16 bytes) is left untouched.
 * len is unsigned, so every length comparison uses unsigned conditions.
 */
ENTRY(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN	# key length kept at ctx+480
	cmp $16, LEN
	jb .Lecb_enc_ret	# less than one block: nothing to do
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4	# unsigned: another four blocks remain
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1	# unsigned: another full block remains
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_enc)
2422
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypts the whole 16-byte blocks of src into dst, using the round
 * keys that start 240 bytes into ctx: 64 bytes (four blocks) per iteration
 * while at least 64 bytes remain, then one block at a time.  A trailing
 * partial block (len % 16 bytes) is left untouched.
 * len is unsigned, so every length comparison uses unsigned conditions.
 */
ENTRY(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN	# key length kept at ctx+480
	add $240, KEYP		# switch to the decryption round keys
	cmp $16, LEN
	jb .Lecb_dec_ret	# less than one block: nothing to do
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4	# unsigned: another four blocks remain
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1	# unsigned: another full block remains
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_dec)
2483
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypts the whole 16-byte blocks of src into dst, chaining from the
 * 16 bytes at iv.  When at least one block was processed, the last
 * ciphertext block is written back to iv.  A trailing partial block
 * (len % 16 bytes) is left untouched.
 * len is unsigned, so every length comparison uses unsigned conditions.
 */
ENTRY(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret	# less than one block: iv is left as is
	mov 480(KEYP), KLEN	# key length kept at ctx+480
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		# chain: plaintext XOR previous ciphertext
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop	# unsigned: another full block remains
	movups STATE, (IVP)	# hand the chaining value back to the caller
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_enc)
2527
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypts the whole 16-byte blocks of src into dst, using the round
 * keys that start 240 bytes into ctx and chaining from the 16 bytes at iv.
 * 64 bytes (four blocks) are handled per iteration while at least 64 bytes
 * remain, then one block at a time.  When at least one block was
 * processed, the last ciphertext block is written back to iv.  A trailing
 * partial block (len % 16 bytes) is left untouched.
 * len is unsigned, so every length comparison uses unsigned conditions.
 */
ENTRY(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret	# less than one block: iv is left as is
	mov 480(KEYP), KLEN	# key length kept at ctx+480
	add $240, KEYP		# switch to the decryption round keys
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/* this path reuses IN1/IN2 for the third and fourth block */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1		# first block chains from the incoming IV
#ifdef __x86_64__
	pxor IN1, STATE2	# each later block chains from the previous
	pxor IN2, STATE3	# ciphertext block
	pxor IN3, STATE4
	movaps IN4, IV		# last ciphertext block is the next IV
#else
	pxor IN1, STATE4	# IN1 holds the third ciphertext block here
	movaps IN2, IV		# IN2 holds the fourth: it is the next IV
	movups (INP), IN1	# reload the first two ciphertext blocks
	pxor IN1, STATE2	# before anything is stored to OUTP
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4	# unsigned: another four blocks remain
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV		# this ciphertext block is the next IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1	# unsigned: another full block remains
.Lcbc_dec_ret:
	movups IV, (IVP)	# hand the chaining value back to the caller
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_dec)
2620
2621#ifdef __x86_64__
/* PSHUFB control that reverses the 16 bytes of an XMM register */
.pushsection .rodata
.align 16
.Lbswap_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
.popsection
2627
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
.align 4
_aesni_inc_init:
	/* x86_64-only code: address the constant RIP-relatively */
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# byte-reverse the IV into CTR
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1
	MOVQ_R64_XMM CTR TCTR_LOW	# TCTR_LOW = low qword of CTR
	ret
ENDPROC(_aesni_inc_init)
2649
/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
.align 4
_aesni_inc:
	add $1, TCTR_LOW		# scalar shadow copy detects the carry
	paddq INC, CTR			# bump the low qword (flags untouched)
	jnc .Linc_low
	/* low qword wrapped: propagate the carry into the high qword */
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC			# restore INC == 1
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# back to big endian
	ret
ENDPROC(_aesni_inc)
2678
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR-mode encrypts the whole 16-byte blocks of src into dst (x86_64 only).
 * The counter block is read from iv, incremented once per processed block,
 * and written back to iv when at least one block was processed.  64 bytes
 * (four blocks) are handled per iteration while at least 64 bytes remain,
 * then one block at a time.  A trailing partial block (len % 16 bytes) is
 * left untouched.
 * len is unsigned, so every length comparison uses unsigned conditions.
 */
ENTRY(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret	# less than one block: iv is left as is
	mov 480(KEYP), KLEN	# key length kept at ctx+480
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* take four consecutive counter values, loading input in between */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	/* output = input XOR encrypted counter */
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4	# unsigned: another four blocks remain
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1	# unsigned: another full block remains
.Lctr_enc_ret:
	movups IV, (IVP)	# hand the next counter value back to the caller
.Lctr_enc_just_ret:
	FRAME_END
	ret
ENDPROC(aesni_ctr_enc)
2741
/*
 * _aesni_gf128mul_x_ble:		internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 *
 * CTR is first turned into the masked carry word (sign bits of the
 * selected IV dwords, ANDed with GF128MUL_MASK); IV is then doubled per
 * qword and the carry word is XORed in.
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	paddq IV, IV; \
	pxor CTR, IV;
2759
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 *
 * Processes eight 16-byte blocks (128 bytes) from src to dst in XTS mode
 * (x86_64 only).  The tweak for the first block is read from iv; the tweak
 * that follows the eighth block is written back to iv.  enc != 0 selects
 * _aesni_enc4 with the round keys at ctx+0; enc == 0 selects _aesni_dec4
 * with the round keys at ctx+240.
 *
 * Each tweak is parked in the matching dst slot while the block is being
 * processed and is XORed back in afterwards.  INC is used as a scratch
 * register throughout.
 */
ENTRY(aesni_xts_crypt8)
	FRAME_BEGIN
	cmpb $0, %cl			# enc == 0 ?
	movl $0, %ecx			# mov (not xor) keeps the flags for cmov
	movl $240, %r10d
	leaq _aesni_enc4(%rip), %r11
	leaq _aesni_dec4(%rip), %rax
	cmovel %r10d, %ecx		# decrypt: key offset 240
	cmoveq %rax, %r11		# decrypt: call _aesni_dec4

	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN		# key length kept at ctx+480
	addq %rcx, KEYP

	/* blocks 0-3: input XOR tweak, tweak parked in dst */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC %r11

	/*
	 * Finish blocks 0-3 (XOR with the parked tweak) while preparing
	 * blocks 4-7 in the freed STATE registers.
	 */
	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)		# tweak for the block after these eight

	CALL_NOSPEC %r11

	/* finish blocks 4-7 */
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
ENDPROC(aesni_xts_crypt8)
2869
2870#endif
2871