xref: /openbmc/linux/arch/x86/crypto/aesni-intel_asm.S (revision ce932d0c5589e9766e089c22c66890dfc48fbd94)
1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 *    Author: Huang Ying <ying.huang@intel.com>
9 *            Vinodh Gopal <vinodh.gopal@intel.com>
10 *            Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
16 *             Adrian Hoban <adrian.hoban@intel.com>
17 *             James Guilford (james.guilford@intel.com)
18 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
19 *             Tadeusz Struk (tadeusz.struk@intel.com)
20 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
21 *    Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 *    Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34
35#ifdef __x86_64__
36.data
37POLY:   .octa 0xC2000000000000000000000000000001
38TWOONE: .octa 0x00000001000000000000000000000001
39
40# order of these constants should not change.
41# more specifically, ALL_F should follow SHIFT_MASK,
42# and ZERO should follow ALL_F
43
44SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
45MASK1:      .octa 0x0000000000000000ffffffffffffffff
46MASK2:      .octa 0xffffffffffffffff0000000000000000
47SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
48ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
49ZERO:       .octa 0x00000000000000000000000000000000
50ONE:        .octa 0x00000000000000000000000000000001
51F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
52dec:        .octa 0x1
53enc:        .octa 0x2
54
55
56.text
57
58
59#define	STACK_OFFSET    8*3
60#define	HashKey		16*0	// store HashKey <<1 mod poly here
61#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
62#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
63#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
64#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
65				// bits of  HashKey <<1 mod poly here
66				//(for Karatsuba purposes)
67#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
68				// bits of  HashKey^2 <<1 mod poly here
69				// (for Karatsuba purposes)
70#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
71				// bits of  HashKey^3 <<1 mod poly here
72				// (for Karatsuba purposes)
73#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
74				// bits of  HashKey^4 <<1 mod poly here
75				// (for Karatsuba purposes)
76#define	VARIABLE_OFFSET	16*8
77
78#define arg1 rdi
79#define arg2 rsi
80#define arg3 rdx
81#define arg4 rcx
82#define arg5 r8
83#define arg6 r9
84#define arg7 STACK_OFFSET+8(%r14)
85#define arg8 STACK_OFFSET+16(%r14)
86#define arg9 STACK_OFFSET+24(%r14)
87#define arg10 STACK_OFFSET+32(%r14)
88#endif
89
90
91#define STATE1	%xmm0
92#define STATE2	%xmm4
93#define STATE3	%xmm5
94#define STATE4	%xmm6
95#define STATE	STATE1
96#define IN1	%xmm1
97#define IN2	%xmm7
98#define IN3	%xmm8
99#define IN4	%xmm9
100#define IN	IN1
101#define KEY	%xmm2
102#define IV	%xmm3
103
104#define BSWAP_MASK %xmm10
105#define CTR	%xmm11
106#define INC	%xmm12
107
108#ifdef __x86_64__
109#define AREG	%rax
110#define KEYP	%rdi
111#define OUTP	%rsi
112#define UKEYP	OUTP
113#define INP	%rdx
114#define LEN	%rcx
115#define IVP	%r8
116#define KLEN	%r9d
117#define T1	%r10
118#define TKEYP	T1
119#define T2	%r11
120#define TCTR_LOW T2
121#else
122#define AREG	%eax
123#define KEYP	%edi
124#define OUTP	AREG
125#define UKEYP	OUTP
126#define INP	%edx
127#define LEN	%esi
128#define IVP	%ebp
129#define KLEN	%ebx
130#define T1	%ecx
131#define TKEYP	T1
132#endif
133
134
135#ifdef __x86_64__
136/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
137*
138*
139* Input: A and B (128-bits each, bit-reflected)
140* Output: C = A*B*x mod poly, (i.e. >>1 )
141* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
142* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
143*
144*/
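/*
* For reference, a hedged C-level sketch of the Karatsuba split performed
* below (illustrative only, not assembled; clmul_64x64() is a hypothetical
* helper returning the 128-bit carry-less product of two 64-bit halves):
*
*	hi  = clmul_64x64(a1, b1);              // a1*b1
*	lo  = clmul_64x64(a0, b0);              // a0*b0
*	mid = clmul_64x64(a1 ^ a0, b1 ^ b0);    // (a1+a0)*(b1+b0)
*	mid ^= hi ^ lo;                         // a1*b0 + a0*b1
*	// 256-bit product = hi:lo with mid folded in at a 64-bit offset,
*	// then reduced modulo x^128 + x^127 + x^126 + x^121 + 1
*/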
145.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
146	movdqa	  \GH, \TMP1
147	pshufd	  $78, \GH, \TMP2
148	pshufd	  $78, \HK, \TMP3
149	pxor	  \GH, \TMP2            # TMP2 = a1+a0
150	pxor	  \HK, \TMP3            # TMP3 = b1+b0
151	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
152	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
153	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
154	pxor	  \GH, \TMP2
155	pxor	  \TMP1, \TMP2          # TMP2 = a1*b0 + a0*b1 (middle term)
156	movdqa	  \TMP2, \TMP3
157	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
158	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
159	pxor	  \TMP3, \GH
160	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
161
162        # first phase of the reduction
163
164	movdqa    \GH, \TMP2
165	movdqa    \GH, \TMP3
166	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
167					# in order to perform
168					# independent shifts
169	pslld     $31, \TMP2            # packed left shift <<31
170	pslld     $30, \TMP3            # packed left shift <<30
171	pslld     $25, \TMP4            # packed left shift <<25
172	pxor      \TMP3, \TMP2          # xor the shifted versions
173	pxor      \TMP4, \TMP2
174	movdqa    \TMP2, \TMP5
175	psrldq    $4, \TMP5             # right shift TMP5 1 DW
176	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
177	pxor      \TMP2, \GH
178
179        # second phase of the reduction
180
181	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
182					# in order to perform
183					# independent shifts
184	movdqa    \GH,\TMP3
185	movdqa    \GH,\TMP4
186	psrld     $1,\TMP2              # packed right shift >>1
187	psrld     $2,\TMP3              # packed right shift >>2
188	psrld     $7,\TMP4              # packed right shift >>7
189	pxor      \TMP3,\TMP2		# xor the shifted versions
190	pxor      \TMP4,\TMP2
191	pxor      \TMP5, \TMP2
192	pxor      \TMP2, \GH
193	pxor      \TMP1, \GH            # result is in GH
194.endm
195
196/*
197* if a = number of total plaintext bytes
198* b = floor(a/16)
199* num_initial_blocks = b mod 4
200* decrypt the initial num_initial_blocks blocks and apply GHASH on
201* the ciphertext
202* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
203* are clobbered
204* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
205*/
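/*
* A minimal C sketch of the block-count arithmetic described above
* (illustrative only, not assembled):
*
*	num_blocks         = plaintext_len / 16;    // b = floor(a/16)
*	num_initial_blocks = num_blocks % 4;        // handled by this macro
*	// the remaining blocks are then processed four at a time
*/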
206
207
208.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
209XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
210	mov	   arg7, %r10           # %r10 = AAD
211	mov	   arg8, %r12           # %r12 = aadLen
212	mov	   %r12, %r11
213	pxor	   %xmm\i, %xmm\i
214_get_AAD_loop\num_initial_blocks\operation:
215	movd	   (%r10), \TMP1
216	pslldq	   $12, \TMP1
217	psrldq	   $4, %xmm\i
218	pxor	   \TMP1, %xmm\i
219	add	   $4, %r10
220	sub	   $4, %r12
221	jne	   _get_AAD_loop\num_initial_blocks\operation
222	cmp	   $16, %r11
223	je	   _get_AAD_loop2_done\num_initial_blocks\operation
224	mov	   $16, %r12
225_get_AAD_loop2\num_initial_blocks\operation:
226	psrldq	   $4, %xmm\i
227	sub	   $4, %r12
228	cmp	   %r11, %r12
229	jne	   _get_AAD_loop2\num_initial_blocks\operation
230_get_AAD_loop2_done\num_initial_blocks\operation:
231        movdqa     SHUF_MASK(%rip), %xmm14
232	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
233
234	xor	   %r11, %r11 # initialise the data pointer offset as zero
235
236        # start AES for num_initial_blocks blocks
237
238	mov	   %arg5, %rax                      # %rax = *Y0
239	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
240        movdqa     SHUF_MASK(%rip), %xmm14
241	PSHUFB_XMM   %xmm14, \XMM0
242
243.if (\i == 5) || (\i == 6) || (\i == 7)
244.irpc index, \i_seq
245	paddd	   ONE(%rip), \XMM0                 # INCR Y0
246	movdqa	   \XMM0, %xmm\index
247        movdqa     SHUF_MASK(%rip), %xmm14
248	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
249
250.endr
251.irpc index, \i_seq
252	pxor	   16*0(%arg1), %xmm\index
253.endr
254.irpc index, \i_seq
255	movaps 0x10(%rdi), \TMP1
256	AESENC     \TMP1, %xmm\index          # Round 1
257.endr
258.irpc index, \i_seq
259	movaps 0x20(%arg1), \TMP1
260	AESENC     \TMP1, %xmm\index          # Round 2
261.endr
262.irpc index, \i_seq
263	movaps 0x30(%arg1), \TMP1
264	AESENC     \TMP1, %xmm\index          # Round 3
265.endr
266.irpc index, \i_seq
267	movaps 0x40(%arg1), \TMP1
268	AESENC     \TMP1, %xmm\index          # Round 4
269.endr
270.irpc index, \i_seq
271	movaps 0x50(%arg1), \TMP1
272	AESENC     \TMP1, %xmm\index          # Round 5
273.endr
274.irpc index, \i_seq
275	movaps 0x60(%arg1), \TMP1
276	AESENC     \TMP1, %xmm\index          # Round 6
277.endr
278.irpc index, \i_seq
279	movaps 0x70(%arg1), \TMP1
280	AESENC     \TMP1, %xmm\index          # Round 7
281.endr
282.irpc index, \i_seq
283	movaps 0x80(%arg1), \TMP1
284	AESENC     \TMP1, %xmm\index          # Round 8
285.endr
286.irpc index, \i_seq
287	movaps 0x90(%arg1), \TMP1
288	AESENC     \TMP1, %xmm\index          # Round 9
289.endr
290.irpc index, \i_seq
291	movaps 0xa0(%arg1), \TMP1
292	AESENCLAST \TMP1, %xmm\index         # Round 10
293.endr
294.irpc index, \i_seq
295	movdqu	   (%arg3 , %r11, 1), \TMP1
296	pxor	   \TMP1, %xmm\index
297	movdqu	   %xmm\index, (%arg2 , %r11, 1)
298	# write back plaintext/ciphertext for num_initial_blocks
299	add	   $16, %r11
300
301	movdqa     \TMP1, %xmm\index
302        movdqa     SHUF_MASK(%rip), %xmm14
303	PSHUFB_XMM	   %xmm14, %xmm\index
304
305		# prepare plaintext/ciphertext for GHASH computation
306.endr
307.endif
308	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
309        # apply GHASH on num_initial_blocks blocks
310
311.if \i == 5
312        pxor       %xmm5, %xmm6
313	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
314        pxor       %xmm6, %xmm7
315	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
316        pxor       %xmm7, %xmm8
317	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
318.elseif \i == 6
319        pxor       %xmm6, %xmm7
320	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
321        pxor       %xmm7, %xmm8
322	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
323.elseif \i == 7
324        pxor       %xmm7, %xmm8
325	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
326.endif
327	cmp	   $64, %r13
328	jl	_initial_blocks_done\num_initial_blocks\operation
329	# no need for precomputed values
330/*
331*
332* Precomputations for HashKey parallel with encryption of first 4 blocks.
333* HashKey_i_k holds the XORed value of the low and high parts of HashKey_i
334*/
335	paddd	   ONE(%rip), \XMM0              # INCR Y0
336	movdqa	   \XMM0, \XMM1
337        movdqa     SHUF_MASK(%rip), %xmm14
338	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
339
340	paddd	   ONE(%rip), \XMM0              # INCR Y0
341	movdqa	   \XMM0, \XMM2
342        movdqa     SHUF_MASK(%rip), %xmm14
343	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
344
345	paddd	   ONE(%rip), \XMM0              # INCR Y0
346	movdqa	   \XMM0, \XMM3
347        movdqa     SHUF_MASK(%rip), %xmm14
348	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
349
350	paddd	   ONE(%rip), \XMM0              # INCR Y0
351	movdqa	   \XMM0, \XMM4
352        movdqa     SHUF_MASK(%rip), %xmm14
353	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
354
355	pxor	   16*0(%arg1), \XMM1
356	pxor	   16*0(%arg1), \XMM2
357	pxor	   16*0(%arg1), \XMM3
358	pxor	   16*0(%arg1), \XMM4
359	movdqa	   \TMP3, \TMP5
360	pshufd	   $78, \TMP3, \TMP1
361	pxor	   \TMP3, \TMP1
362	movdqa	   \TMP1, HashKey_k(%rsp)
363	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
364# TMP5 = HashKey^2<<1 (mod poly)
365	movdqa	   \TMP5, HashKey_2(%rsp)
366# HashKey_2 = HashKey^2<<1 (mod poly)
367	pshufd	   $78, \TMP5, \TMP1
368	pxor	   \TMP5, \TMP1
369	movdqa	   \TMP1, HashKey_2_k(%rsp)
370.irpc index, 1234 # do 4 rounds
371	movaps 0x10*\index(%arg1), \TMP1
372	AESENC	   \TMP1, \XMM1
373	AESENC	   \TMP1, \XMM2
374	AESENC	   \TMP1, \XMM3
375	AESENC	   \TMP1, \XMM4
376.endr
377	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
378# TMP5 = HashKey^3<<1 (mod poly)
379	movdqa	   \TMP5, HashKey_3(%rsp)
380	pshufd	   $78, \TMP5, \TMP1
381	pxor	   \TMP5, \TMP1
382	movdqa	   \TMP1, HashKey_3_k(%rsp)
383.irpc index, 56789 # do next 5 rounds
384	movaps 0x10*\index(%arg1), \TMP1
385	AESENC	   \TMP1, \XMM1
386	AESENC	   \TMP1, \XMM2
387	AESENC	   \TMP1, \XMM3
388	AESENC	   \TMP1, \XMM4
389.endr
390	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
391# TMP5 = HashKey^4<<1 (mod poly)
392	movdqa	   \TMP5, HashKey_4(%rsp)
393	pshufd	   $78, \TMP5, \TMP1
394	pxor	   \TMP5, \TMP1
395	movdqa	   \TMP1, HashKey_4_k(%rsp)
396	movaps 0xa0(%arg1), \TMP2
397	AESENCLAST \TMP2, \XMM1
398	AESENCLAST \TMP2, \XMM2
399	AESENCLAST \TMP2, \XMM3
400	AESENCLAST \TMP2, \XMM4
401	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
402	pxor	   \TMP1, \XMM1
403	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
404	movdqa     \TMP1, \XMM1
405	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
406	pxor	   \TMP1, \XMM2
407	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
408	movdqa     \TMP1, \XMM2
409	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
410	pxor	   \TMP1, \XMM3
411	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
412	movdqa     \TMP1, \XMM3
413	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
414	pxor	   \TMP1, \XMM4
415	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
416	movdqa     \TMP1, \XMM4
417	add	   $64, %r11
418        movdqa     SHUF_MASK(%rip), %xmm14
419	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
420	pxor	   \XMMDst, \XMM1
421# combine GHASHed value with the corresponding ciphertext
422        movdqa     SHUF_MASK(%rip), %xmm14
423	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
424        movdqa     SHUF_MASK(%rip), %xmm14
425	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
426        movdqa     SHUF_MASK(%rip), %xmm14
427	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
428
429_initial_blocks_done\num_initial_blocks\operation:
430
431.endm
432
433
434/*
435* if a = number of total plaintext bytes
436* b = floor(a/16)
437* num_initial_blocks = b mod 4
438* encrypt the initial num_initial_blocks blocks and apply GHASH on
439* the ciphertext
440* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
441* are clobbered
442* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
443*/
444
445
446.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
447XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
448	mov	   arg7, %r10           # %r10 = AAD
449	mov	   arg8, %r12           # %r12 = aadLen
450	mov	   %r12, %r11
451	pxor	   %xmm\i, %xmm\i
452_get_AAD_loop\num_initial_blocks\operation:
453	movd	   (%r10), \TMP1
454	pslldq	   $12, \TMP1
455	psrldq	   $4, %xmm\i
456	pxor	   \TMP1, %xmm\i
457	add	   $4, %r10
458	sub	   $4, %r12
459	jne	   _get_AAD_loop\num_initial_blocks\operation
460	cmp	   $16, %r11
461	je	   _get_AAD_loop2_done\num_initial_blocks\operation
462	mov	   $16, %r12
463_get_AAD_loop2\num_initial_blocks\operation:
464	psrldq	   $4, %xmm\i
465	sub	   $4, %r12
466	cmp	   %r11, %r12
467	jne	   _get_AAD_loop2\num_initial_blocks\operation
468_get_AAD_loop2_done\num_initial_blocks\operation:
469        movdqa     SHUF_MASK(%rip), %xmm14
470	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
471
472	xor	   %r11, %r11 # initialise the data pointer offset as zero
473
474        # start AES for num_initial_blocks blocks
475
476	mov	   %arg5, %rax                      # %rax = *Y0
477	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
478        movdqa     SHUF_MASK(%rip), %xmm14
479	PSHUFB_XMM   %xmm14, \XMM0
480
481.if (\i == 5) || (\i == 6) || (\i == 7)
482.irpc index, \i_seq
483	paddd	   ONE(%rip), \XMM0                 # INCR Y0
484	movdqa	   \XMM0, %xmm\index
485        movdqa     SHUF_MASK(%rip), %xmm14
486	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
487
488.endr
489.irpc index, \i_seq
490	pxor	   16*0(%arg1), %xmm\index
491.endr
492.irpc index, \i_seq
493	movaps 0x10(%rdi), \TMP1
494	AESENC     \TMP1, %xmm\index          # Round 1
495.endr
496.irpc index, \i_seq
497	movaps 0x20(%arg1), \TMP1
498	AESENC     \TMP1, %xmm\index          # Round 2
499.endr
500.irpc index, \i_seq
501	movaps 0x30(%arg1), \TMP1
502	AESENC     \TMP1, %xmm\index          # Round 3
503.endr
504.irpc index, \i_seq
505	movaps 0x40(%arg1), \TMP1
506	AESENC     \TMP1, %xmm\index          # Round 4
507.endr
508.irpc index, \i_seq
509	movaps 0x50(%arg1), \TMP1
510	AESENC     \TMP1, %xmm\index          # Round 5
511.endr
512.irpc index, \i_seq
513	movaps 0x60(%arg1), \TMP1
514	AESENC     \TMP1, %xmm\index          # Round 6
515.endr
516.irpc index, \i_seq
517	movaps 0x70(%arg1), \TMP1
518	AESENC     \TMP1, %xmm\index          # Round 7
519.endr
520.irpc index, \i_seq
521	movaps 0x80(%arg1), \TMP1
522	AESENC     \TMP1, %xmm\index          # Round 8
523.endr
524.irpc index, \i_seq
525	movaps 0x90(%arg1), \TMP1
526	AESENC     \TMP1, %xmm\index          # Round 9
527.endr
528.irpc index, \i_seq
529	movaps 0xa0(%arg1), \TMP1
530	AESENCLAST \TMP1, %xmm\index         # Round 10
531.endr
532.irpc index, \i_seq
533	movdqu	   (%arg3 , %r11, 1), \TMP1
534	pxor	   \TMP1, %xmm\index
535	movdqu	   %xmm\index, (%arg2 , %r11, 1)
536	# write back plaintext/ciphertext for num_initial_blocks
537	add	   $16, %r11
538
539        movdqa     SHUF_MASK(%rip), %xmm14
540	PSHUFB_XMM	   %xmm14, %xmm\index
541
542		# prepare plaintext/ciphertext for GHASH computation
543.endr
544.endif
545	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
546        # apply GHASH on num_initial_blocks blocks
547
548.if \i == 5
549        pxor       %xmm5, %xmm6
550	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
551        pxor       %xmm6, %xmm7
552	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
553        pxor       %xmm7, %xmm8
554	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
555.elseif \i == 6
556        pxor       %xmm6, %xmm7
557	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
558        pxor       %xmm7, %xmm8
559	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
560.elseif \i == 7
561        pxor       %xmm7, %xmm8
562	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
563.endif
564	cmp	   $64, %r13
565	jl	_initial_blocks_done\num_initial_blocks\operation
566	# no need for precomputed values
567/*
568*
569* Precomputations for HashKey parallel with encryption of first 4 blocks.
570* HashKey_i_k holds the XORed value of the low and high parts of HashKey_i
571*/
572	paddd	   ONE(%rip), \XMM0              # INCR Y0
573	movdqa	   \XMM0, \XMM1
574        movdqa     SHUF_MASK(%rip), %xmm14
575	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
576
577	paddd	   ONE(%rip), \XMM0              # INCR Y0
578	movdqa	   \XMM0, \XMM2
579        movdqa     SHUF_MASK(%rip), %xmm14
580	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
581
582	paddd	   ONE(%rip), \XMM0              # INCR Y0
583	movdqa	   \XMM0, \XMM3
584        movdqa     SHUF_MASK(%rip), %xmm14
585	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
586
587	paddd	   ONE(%rip), \XMM0              # INCR Y0
588	movdqa	   \XMM0, \XMM4
589        movdqa     SHUF_MASK(%rip), %xmm14
590	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
591
592	pxor	   16*0(%arg1), \XMM1
593	pxor	   16*0(%arg1), \XMM2
594	pxor	   16*0(%arg1), \XMM3
595	pxor	   16*0(%arg1), \XMM4
596	movdqa	   \TMP3, \TMP5
597	pshufd	   $78, \TMP3, \TMP1
598	pxor	   \TMP3, \TMP1
599	movdqa	   \TMP1, HashKey_k(%rsp)
600	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
601# TMP5 = HashKey^2<<1 (mod poly)
602	movdqa	   \TMP5, HashKey_2(%rsp)
603# HashKey_2 = HashKey^2<<1 (mod poly)
604	pshufd	   $78, \TMP5, \TMP1
605	pxor	   \TMP5, \TMP1
606	movdqa	   \TMP1, HashKey_2_k(%rsp)
607.irpc index, 1234 # do 4 rounds
608	movaps 0x10*\index(%arg1), \TMP1
609	AESENC	   \TMP1, \XMM1
610	AESENC	   \TMP1, \XMM2
611	AESENC	   \TMP1, \XMM3
612	AESENC	   \TMP1, \XMM4
613.endr
614	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
615# TMP5 = HashKey^3<<1 (mod poly)
616	movdqa	   \TMP5, HashKey_3(%rsp)
617	pshufd	   $78, \TMP5, \TMP1
618	pxor	   \TMP5, \TMP1
619	movdqa	   \TMP1, HashKey_3_k(%rsp)
620.irpc index, 56789 # do next 5 rounds
621	movaps 0x10*\index(%arg1), \TMP1
622	AESENC	   \TMP1, \XMM1
623	AESENC	   \TMP1, \XMM2
624	AESENC	   \TMP1, \XMM3
625	AESENC	   \TMP1, \XMM4
626.endr
627	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
628# TMP5 = HashKey^4<<1 (mod poly)
629	movdqa	   \TMP5, HashKey_4(%rsp)
630	pshufd	   $78, \TMP5, \TMP1
631	pxor	   \TMP5, \TMP1
632	movdqa	   \TMP1, HashKey_4_k(%rsp)
633	movaps 0xa0(%arg1), \TMP2
634	AESENCLAST \TMP2, \XMM1
635	AESENCLAST \TMP2, \XMM2
636	AESENCLAST \TMP2, \XMM3
637	AESENCLAST \TMP2, \XMM4
638	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
639	pxor	   \TMP1, \XMM1
640	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
641	pxor	   \TMP1, \XMM2
642	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
643	pxor	   \TMP1, \XMM3
644	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
645	pxor	   \TMP1, \XMM4
646	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
647	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
648	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
649	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
650
651	add	   $64, %r11
652        movdqa     SHUF_MASK(%rip), %xmm14
653	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
654	pxor	   \XMMDst, \XMM1
655# combine GHASHed value with the corresponding ciphertext
656        movdqa     SHUF_MASK(%rip), %xmm14
657	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
658        movdqa     SHUF_MASK(%rip), %xmm14
659	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
660        movdqa     SHUF_MASK(%rip), %xmm14
661	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
662
663_initial_blocks_done\num_initial_blocks\operation:
664
665.endm
666
667/*
668* encrypt 4 blocks at a time
669* ghash the 4 previously encrypted ciphertext blocks
670* arg1, %arg2, %arg3 are used as pointers only, not modified
671* %r11 is the data offset value
672*/
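/*
* Rough pseudo-C of the interleaving done by this macro (a sketch only,
* not assembled; all helper names are hypothetical): each 64-byte step
* folds the previous four ciphertext blocks into the GHASH state while
* the next four counter blocks are being encrypted, so the PCLMULQDQ and
* AESENC work overlaps:
*
*	for (; remaining >= 64; remaining -= 64) {
*		ghash_fold_4(prev);             // previous 4 ciphertext blocks
*		keystream = aes_ctr_4(&cnt);    // encrypt next 4 counter blocks
*		prev = xor_and_store_4(keystream, in, out);
*	}
*/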
673.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
674TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
675
676	movdqa	  \XMM1, \XMM5
677	movdqa	  \XMM2, \XMM6
678	movdqa	  \XMM3, \XMM7
679	movdqa	  \XMM4, \XMM8
680
681        movdqa    SHUF_MASK(%rip), %xmm15
682        # multiply TMP5 * HashKey using karatsuba
683
684	movdqa	  \XMM5, \TMP4
685	pshufd	  $78, \XMM5, \TMP6
686	pxor	  \XMM5, \TMP6
687	paddd     ONE(%rip), \XMM0		# INCR CNT
688	movdqa	  HashKey_4(%rsp), \TMP5
689	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
690	movdqa    \XMM0, \XMM1
691	paddd     ONE(%rip), \XMM0		# INCR CNT
692	movdqa    \XMM0, \XMM2
693	paddd     ONE(%rip), \XMM0		# INCR CNT
694	movdqa    \XMM0, \XMM3
695	paddd     ONE(%rip), \XMM0		# INCR CNT
696	movdqa    \XMM0, \XMM4
697	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
698	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
699	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
700	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
701	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
702
703	pxor	  (%arg1), \XMM1
704	pxor	  (%arg1), \XMM2
705	pxor	  (%arg1), \XMM3
706	pxor	  (%arg1), \XMM4
707	movdqa	  HashKey_4_k(%rsp), \TMP5
708	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
709	movaps 0x10(%arg1), \TMP1
710	AESENC	  \TMP1, \XMM1              # Round 1
711	AESENC	  \TMP1, \XMM2
712	AESENC	  \TMP1, \XMM3
713	AESENC	  \TMP1, \XMM4
714	movaps 0x20(%arg1), \TMP1
715	AESENC	  \TMP1, \XMM1              # Round 2
716	AESENC	  \TMP1, \XMM2
717	AESENC	  \TMP1, \XMM3
718	AESENC	  \TMP1, \XMM4
719	movdqa	  \XMM6, \TMP1
720	pshufd	  $78, \XMM6, \TMP2
721	pxor	  \XMM6, \TMP2
722	movdqa	  HashKey_3(%rsp), \TMP5
723	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
724	movaps 0x30(%arg1), \TMP3
725	AESENC    \TMP3, \XMM1              # Round 3
726	AESENC    \TMP3, \XMM2
727	AESENC    \TMP3, \XMM3
728	AESENC    \TMP3, \XMM4
729	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
730	movaps 0x40(%arg1), \TMP3
731	AESENC	  \TMP3, \XMM1              # Round 4
732	AESENC	  \TMP3, \XMM2
733	AESENC	  \TMP3, \XMM3
734	AESENC	  \TMP3, \XMM4
735	movdqa	  HashKey_3_k(%rsp), \TMP5
736	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
737	movaps 0x50(%arg1), \TMP3
738	AESENC	  \TMP3, \XMM1              # Round 5
739	AESENC	  \TMP3, \XMM2
740	AESENC	  \TMP3, \XMM3
741	AESENC	  \TMP3, \XMM4
742	pxor	  \TMP1, \TMP4
743# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
744	pxor	  \XMM6, \XMM5
745	pxor	  \TMP2, \TMP6
746	movdqa	  \XMM7, \TMP1
747	pshufd	  $78, \XMM7, \TMP2
748	pxor	  \XMM7, \TMP2
749	movdqa	  HashKey_2(%rsp ), \TMP5
750
751        # Multiply TMP5 * HashKey using karatsuba
752
753	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
754	movaps 0x60(%arg1), \TMP3
755	AESENC	  \TMP3, \XMM1              # Round 6
756	AESENC	  \TMP3, \XMM2
757	AESENC	  \TMP3, \XMM3
758	AESENC	  \TMP3, \XMM4
759	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
760	movaps 0x70(%arg1), \TMP3
761	AESENC	  \TMP3, \XMM1             # Round 7
762	AESENC	  \TMP3, \XMM2
763	AESENC	  \TMP3, \XMM3
764	AESENC	  \TMP3, \XMM4
765	movdqa	  HashKey_2_k(%rsp), \TMP5
766	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
767	movaps 0x80(%arg1), \TMP3
768	AESENC	  \TMP3, \XMM1             # Round 8
769	AESENC	  \TMP3, \XMM2
770	AESENC	  \TMP3, \XMM3
771	AESENC	  \TMP3, \XMM4
772	pxor	  \TMP1, \TMP4
773# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
774	pxor	  \XMM7, \XMM5
775	pxor	  \TMP2, \TMP6
776
777        # Multiply XMM8 * HashKey
778        # XMM8 and TMP5 hold the values for the two operands
779
780	movdqa	  \XMM8, \TMP1
781	pshufd	  $78, \XMM8, \TMP2
782	pxor	  \XMM8, \TMP2
783	movdqa	  HashKey(%rsp), \TMP5
784	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
785	movaps 0x90(%arg1), \TMP3
786	AESENC	  \TMP3, \XMM1            # Round 9
787	AESENC	  \TMP3, \XMM2
788	AESENC	  \TMP3, \XMM3
789	AESENC	  \TMP3, \XMM4
790	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
791	movaps 0xa0(%arg1), \TMP3
792	AESENCLAST \TMP3, \XMM1           # Round 10
793	AESENCLAST \TMP3, \XMM2
794	AESENCLAST \TMP3, \XMM3
795	AESENCLAST \TMP3, \XMM4
796	movdqa    HashKey_k(%rsp), \TMP5
797	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
798	movdqu	  (%arg3,%r11,1), \TMP3
799	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
800	movdqu	  16(%arg3,%r11,1), \TMP3
801	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
802	movdqu	  32(%arg3,%r11,1), \TMP3
803	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
804	movdqu	  48(%arg3,%r11,1), \TMP3
805	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
806        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
807        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
808        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
809        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
810	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
811	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
812	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
813	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
814
815	pxor	  \TMP4, \TMP1
816	pxor	  \XMM8, \XMM5
817	pxor	  \TMP6, \TMP2
818	pxor	  \TMP1, \TMP2
819	pxor	  \XMM5, \TMP2
820	movdqa	  \TMP2, \TMP3
821	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
822	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
823	pxor	  \TMP3, \XMM5
824	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
825
826        # first phase of reduction
827
828	movdqa    \XMM5, \TMP2
829	movdqa    \XMM5, \TMP3
830	movdqa    \XMM5, \TMP4
831# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
832	pslld     $31, \TMP2                   # packed left shift << 31
833	pslld     $30, \TMP3                   # packed left shift << 30
834	pslld     $25, \TMP4                   # packed left shift << 25
835	pxor      \TMP3, \TMP2	               # xor the shifted versions
836	pxor      \TMP4, \TMP2
837	movdqa    \TMP2, \TMP5
838	psrldq    $4, \TMP5                    # right shift T5 1 DW
839	pslldq    $12, \TMP2                   # left shift T2 3 DWs
840	pxor      \TMP2, \XMM5
841
842        # second phase of reduction
843
844	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
845	movdqa    \XMM5,\TMP3
846	movdqa    \XMM5,\TMP4
847	psrld     $1, \TMP2                    # packed right shift >>1
848	psrld     $2, \TMP3                    # packed right shift >>2
849	psrld     $7, \TMP4                    # packed right shift >>7
850	pxor      \TMP3,\TMP2		       # xor the shifted versions
851	pxor      \TMP4,\TMP2
852	pxor      \TMP5, \TMP2
853	pxor      \TMP2, \XMM5
854	pxor      \TMP1, \XMM5                 # result is in XMM5
855
856	pxor	  \XMM5, \XMM1
857.endm
858
859/*
860* decrypt 4 blocks at a time
861* ghash the 4 previously decrypted ciphertext blocks
862* arg1, %arg2, %arg3 are used as pointers only, not modified
863* %r11 is the data offset value
864*/
865.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
866TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
867
868	movdqa	  \XMM1, \XMM5
869	movdqa	  \XMM2, \XMM6
870	movdqa	  \XMM3, \XMM7
871	movdqa	  \XMM4, \XMM8
872
873        movdqa    SHUF_MASK(%rip), %xmm15
874        # multiply TMP5 * HashKey using karatsuba
875
876	movdqa	  \XMM5, \TMP4
877	pshufd	  $78, \XMM5, \TMP6
878	pxor	  \XMM5, \TMP6
879	paddd     ONE(%rip), \XMM0		# INCR CNT
880	movdqa	  HashKey_4(%rsp), \TMP5
881	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
882	movdqa    \XMM0, \XMM1
883	paddd     ONE(%rip), \XMM0		# INCR CNT
884	movdqa    \XMM0, \XMM2
885	paddd     ONE(%rip), \XMM0		# INCR CNT
886	movdqa    \XMM0, \XMM3
887	paddd     ONE(%rip), \XMM0		# INCR CNT
888	movdqa    \XMM0, \XMM4
889	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
890	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
891	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
892	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
893	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
894
895	pxor	  (%arg1), \XMM1
896	pxor	  (%arg1), \XMM2
897	pxor	  (%arg1), \XMM3
898	pxor	  (%arg1), \XMM4
899	movdqa	  HashKey_4_k(%rsp), \TMP5
900	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
901	movaps 0x10(%arg1), \TMP1
902	AESENC	  \TMP1, \XMM1              # Round 1
903	AESENC	  \TMP1, \XMM2
904	AESENC	  \TMP1, \XMM3
905	AESENC	  \TMP1, \XMM4
906	movaps 0x20(%arg1), \TMP1
907	AESENC	  \TMP1, \XMM1              # Round 2
908	AESENC	  \TMP1, \XMM2
909	AESENC	  \TMP1, \XMM3
910	AESENC	  \TMP1, \XMM4
911	movdqa	  \XMM6, \TMP1
912	pshufd	  $78, \XMM6, \TMP2
913	pxor	  \XMM6, \TMP2
914	movdqa	  HashKey_3(%rsp), \TMP5
915	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
916	movaps 0x30(%arg1), \TMP3
917	AESENC    \TMP3, \XMM1              # Round 3
918	AESENC    \TMP3, \XMM2
919	AESENC    \TMP3, \XMM3
920	AESENC    \TMP3, \XMM4
921	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
922	movaps 0x40(%arg1), \TMP3
923	AESENC	  \TMP3, \XMM1              # Round 4
924	AESENC	  \TMP3, \XMM2
925	AESENC	  \TMP3, \XMM3
926	AESENC	  \TMP3, \XMM4
927	movdqa	  HashKey_3_k(%rsp), \TMP5
928	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
929	movaps 0x50(%arg1), \TMP3
930	AESENC	  \TMP3, \XMM1              # Round 5
931	AESENC	  \TMP3, \XMM2
932	AESENC	  \TMP3, \XMM3
933	AESENC	  \TMP3, \XMM4
934	pxor	  \TMP1, \TMP4
935# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
936	pxor	  \XMM6, \XMM5
937	pxor	  \TMP2, \TMP6
938	movdqa	  \XMM7, \TMP1
939	pshufd	  $78, \XMM7, \TMP2
940	pxor	  \XMM7, \TMP2
941	movdqa	  HashKey_2(%rsp ), \TMP5
942
943        # Multiply TMP5 * HashKey using karatsuba
944
945	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
946	movaps 0x60(%arg1), \TMP3
947	AESENC	  \TMP3, \XMM1              # Round 6
948	AESENC	  \TMP3, \XMM2
949	AESENC	  \TMP3, \XMM3
950	AESENC	  \TMP3, \XMM4
951	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
952	movaps 0x70(%arg1), \TMP3
953	AESENC	  \TMP3, \XMM1             # Round 7
954	AESENC	  \TMP3, \XMM2
955	AESENC	  \TMP3, \XMM3
956	AESENC	  \TMP3, \XMM4
957	movdqa	  HashKey_2_k(%rsp), \TMP5
958	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
959	movaps 0x80(%arg1), \TMP3
960	AESENC	  \TMP3, \XMM1             # Round 8
961	AESENC	  \TMP3, \XMM2
962	AESENC	  \TMP3, \XMM3
963	AESENC	  \TMP3, \XMM4
964	pxor	  \TMP1, \TMP4
965# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
966	pxor	  \XMM7, \XMM5
967	pxor	  \TMP2, \TMP6
968
969        # Multiply XMM8 * HashKey
970        # XMM8 and TMP5 hold the values for the two operands
971
972	movdqa	  \XMM8, \TMP1
973	pshufd	  $78, \XMM8, \TMP2
974	pxor	  \XMM8, \TMP2
975	movdqa	  HashKey(%rsp), \TMP5
976	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
977	movaps 0x90(%arg1), \TMP3
978	AESENC	  \TMP3, \XMM1            # Round 9
979	AESENC	  \TMP3, \XMM2
980	AESENC	  \TMP3, \XMM3
981	AESENC	  \TMP3, \XMM4
982	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
983	movaps 0xa0(%arg1), \TMP3
984	AESENCLAST \TMP3, \XMM1           # Round 10
985	AESENCLAST \TMP3, \XMM2
986	AESENCLAST \TMP3, \XMM3
987	AESENCLAST \TMP3, \XMM4
988	movdqa    HashKey_k(%rsp), \TMP5
989	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
990	movdqu	  (%arg3,%r11,1), \TMP3
991	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
992	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
993	movdqa    \TMP3, \XMM1
994	movdqu	  16(%arg3,%r11,1), \TMP3
995	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
996	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
997	movdqa    \TMP3, \XMM2
998	movdqu	  32(%arg3,%r11,1), \TMP3
999	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1000	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
1001	movdqa    \TMP3, \XMM3
1002	movdqu	  48(%arg3,%r11,1), \TMP3
1003	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1004	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
1005	movdqa    \TMP3, \XMM4
1006	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1007	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1008	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1009	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1010
1011	pxor	  \TMP4, \TMP1
1012	pxor	  \XMM8, \XMM5
1013	pxor	  \TMP6, \TMP2
1014	pxor	  \TMP1, \TMP2
1015	pxor	  \XMM5, \TMP2
1016	movdqa	  \TMP2, \TMP3
1017	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1018	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1019	pxor	  \TMP3, \XMM5
1020	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1021
1022        # first phase of reduction
1023
1024	movdqa    \XMM5, \TMP2
1025	movdqa    \XMM5, \TMP3
1026	movdqa    \XMM5, \TMP4
1027# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1028	pslld     $31, \TMP2                   # packed left shift << 31
1029	pslld     $30, \TMP3                   # packed left shift << 30
1030	pslld     $25, \TMP4                   # packed left shift << 25
1031	pxor      \TMP3, \TMP2	               # xor the shifted versions
1032	pxor      \TMP4, \TMP2
1033	movdqa    \TMP2, \TMP5
1034	psrldq    $4, \TMP5                    # right shift T5 1 DW
1035	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1036	pxor      \TMP2, \XMM5
1037
1038        # second phase of reduction
1039
1040	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1041	movdqa    \XMM5,\TMP3
1042	movdqa    \XMM5,\TMP4
1043	psrld     $1, \TMP2                    # packed right shift >>1
1044	psrld     $2, \TMP3                    # packed right shift >>2
1045	psrld     $7, \TMP4                    # packed right shift >>7
1046	pxor      \TMP3,\TMP2		       # xor the shifted versions
1047	pxor      \TMP4,\TMP2
1048	pxor      \TMP5, \TMP2
1049	pxor      \TMP2, \XMM5
1050	pxor      \TMP1, \XMM5                 # result is in XMM5
1051
1052	pxor	  \XMM5, \XMM1
1053.endm
1054
1055/* GHASH the last 4 ciphertext blocks. */
1056.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1057TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1058
1059        # Multiply TMP6 * HashKey (using Karatsuba)
1060
1061	movdqa	  \XMM1, \TMP6
1062	pshufd	  $78, \XMM1, \TMP2
1063	pxor	  \XMM1, \TMP2
1064	movdqa	  HashKey_4(%rsp), \TMP5
1065	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1066	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1067	movdqa	  HashKey_4_k(%rsp), \TMP4
1068	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1069	movdqa	  \XMM1, \XMMDst
1070	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1071
1072        # Multiply TMP1 * HashKey (using Karatsuba)
1073
1074	movdqa	  \XMM2, \TMP1
1075	pshufd	  $78, \XMM2, \TMP2
1076	pxor	  \XMM2, \TMP2
1077	movdqa	  HashKey_3(%rsp), \TMP5
1078	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1079	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1080	movdqa	  HashKey_3_k(%rsp), \TMP4
1081	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1082	pxor	  \TMP1, \TMP6
1083	pxor	  \XMM2, \XMMDst
1084	pxor	  \TMP2, \XMM1
1085# results accumulated in TMP6, XMMDst, XMM1
1086
1087        # Multiply TMP1 * HashKey (using Karatsuba)
1088
1089	movdqa	  \XMM3, \TMP1
1090	pshufd	  $78, \XMM3, \TMP2
1091	pxor	  \XMM3, \TMP2
1092	movdqa	  HashKey_2(%rsp), \TMP5
1093	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1094	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1095	movdqa	  HashKey_2_k(%rsp), \TMP4
1096	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1097	pxor	  \TMP1, \TMP6
1098	pxor	  \XMM3, \XMMDst
1099	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1100
1101        # Multiply TMP1 * HashKey (using Karatsuba)
1102	movdqa	  \XMM4, \TMP1
1103	pshufd	  $78, \XMM4, \TMP2
1104	pxor	  \XMM4, \TMP2
1105	movdqa	  HashKey(%rsp), \TMP5
1106	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1107	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1108	movdqa	  HashKey_k(%rsp), \TMP4
1109	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1110	pxor	  \TMP1, \TMP6
1111	pxor	  \XMM4, \XMMDst
1112	pxor	  \XMM1, \TMP2
1113	pxor	  \TMP6, \TMP2
1114	pxor	  \XMMDst, \TMP2
1115	# middle section of the temp results combined as in karatsuba algorithm
1116	movdqa	  \TMP2, \TMP4
1117	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1118	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1119	pxor	  \TMP4, \XMMDst
1120	pxor	  \TMP2, \TMP6
1121# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1122	# first phase of the reduction
1123	movdqa    \XMMDst, \TMP2
1124	movdqa    \XMMDst, \TMP3
1125	movdqa    \XMMDst, \TMP4
1126# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1127	pslld     $31, \TMP2                # packed left shift << 31
1128	pslld     $30, \TMP3                # packed left shift << 30
1129	pslld     $25, \TMP4                # packed left shift << 25
1130	pxor      \TMP3, \TMP2              # xor the shifted versions
1131	pxor      \TMP4, \TMP2
1132	movdqa    \TMP2, \TMP7
1133	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1134	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1135	pxor      \TMP2, \XMMDst
1136
1137        # second phase of the reduction
1138	movdqa    \XMMDst, \TMP2
1139	# make 3 copies of XMMDst for doing 3 shift operations
1140	movdqa    \XMMDst, \TMP3
1141	movdqa    \XMMDst, \TMP4
1142	psrld     $1, \TMP2                 # packed right shift >> 1
1143	psrld     $2, \TMP3                 # packed right shift >> 2
1144	psrld     $7, \TMP4                 # packed right shift >> 7
1145	pxor      \TMP3, \TMP2              # xor the shifted versions
1146	pxor      \TMP4, \TMP2
1147	pxor      \TMP7, \TMP2
1148	pxor      \TMP2, \XMMDst
1149	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1150.endm
1151
1152/* Encrypt a single block with the expanded AES-128 key schedule (10 rounds). */
1153.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1154
1155	pxor	(%arg1), \XMM0
1156        movaps 16(%arg1), \TMP1
1157	AESENC	\TMP1, \XMM0
1158        movaps 32(%arg1), \TMP1
1159	AESENC	\TMP1, \XMM0
1160        movaps 48(%arg1), \TMP1
1161	AESENC	\TMP1, \XMM0
1162        movaps 64(%arg1), \TMP1
1163	AESENC	\TMP1, \XMM0
1164        movaps 80(%arg1), \TMP1
1165	AESENC	\TMP1, \XMM0
1166        movaps 96(%arg1), \TMP1
1167	AESENC	\TMP1, \XMM0
1168        movaps 112(%arg1), \TMP1
1169	AESENC	\TMP1, \XMM0
1170        movaps 128(%arg1), \TMP1
1171	AESENC	\TMP1, \XMM0
1172        movaps 144(%arg1), \TMP1
1173	AESENC	\TMP1, \XMM0
1174        movaps 160(%arg1), \TMP1
1175	AESENCLAST	\TMP1, \XMM0
1176.endm
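/*
* The same single-block AES-128 encryption expressed with C intrinsics,
* for reference only (assumes the 11 round keys are already expanded in
* rk[0..10]; not part of this file):
*
*	#include <wmmintrin.h>
*
*	static __m128i aes128_encrypt_block(__m128i b, const __m128i rk[11])
*	{
*		int i;
*
*		b = _mm_xor_si128(b, rk[0]);             // whitening key
*		for (i = 1; i < 10; i++)
*			b = _mm_aesenc_si128(b, rk[i]);  // rounds 1-9
*		return _mm_aesenclast_si128(b, rk[10]);  // round 10
*	}
*/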
1177
1178
1179/*****************************************************************************
1180* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1181*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1182*                   const u8 *in,      // Ciphertext input
1183*                   u64 plaintext_len, // Length of data in bytes for decryption.
1184*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1185*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1187*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188*                   const u8 *aad,     // Additional Authentication Data (AAD)
1189*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1191*                                      // given authentication tag and only return the plaintext if they match.
1192*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193*                                      // (most likely), 12 or 8.
1194*
1195* Assumptions:
1196*
1197* keys:
1198*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1199*       set of 11 keys in the data structure void *aes_ctx
1200*
1201* iv:
1202*       0                   1                   2                   3
1203*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205*       |                             Salt  (From the SA)               |
1206*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207*       |                     Initialization Vector                     |
1208*       |         (This is the sequence number from IPSec header)       |
1209*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1210*       |                              0x1                              |
1211*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1212*
1213*
1214*
1215* AAD:
1216*       AAD padded to 128 bits with 0
1217*       for example, assume AAD is a u32 vector
1218*
1219*       if AAD is 8 bytes:
1220*       AAD[3] = {A0, A1};
1221*       padded AAD in xmm register = {A1 A0 0 0}
1222*
1223*       0                   1                   2                   3
1224*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1226*       |                               SPI (A1)                        |
1227*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228*       |                     32-bit Sequence Number (A0)               |
1229*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230*       |                              0x0                              |
1231*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*
1233*                                       AAD Format with 32-bit Sequence Number
1234*
1235*       if AAD is 12 bytes:
1236*       AAD[3] = {A0, A1, A2};
1237*       padded AAD in xmm register = {A2 A1 A0 0}
1238*
1239*       0                   1                   2                   3
1240*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1241*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244*       |                               SPI (A2)                        |
1245*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246*       |                 64-bit Extended Sequence Number {A1,A0}       |
1247*       |                                                               |
1248*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249*       |                              0x0                              |
1250*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252*                        AAD Format with 64-bit Extended Sequence Number
1253*
1254* aadLen:
1255*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1256*       The code supports 16 too but for other sizes, the code will fail.
1257*
1258* TLen:
1259*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1260*       For other sizes, the code will fail.
1261*
1262* poly = x^128 + x^127 + x^126 + x^121 + 1
1263*
1264*****************************************************************************/
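/*
* Illustrative C sketch of how a caller could assemble the pre-counter
* block j0 described above (a sketch under the RFC4106 layout only; the
* buffer names are hypothetical and this is not part of this file):
*
*	u8 j0[16];
*
*	memcpy(j0, salt, 4);            // 4-byte salt from the SA
*	memcpy(j0 + 4, esp_iv, 8);      // 8-byte IV from the ESP payload
*	j0[12] = 0; j0[13] = 0; j0[14] = 0; j0[15] = 1;  // trailing 0x00000001
*/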
1265
1266ENTRY(aesni_gcm_dec)
1267	push	%r12
1268	push	%r13
1269	push	%r14
1270	mov	%rsp, %r14
1271/*
1272* states of %xmm registers %xmm6:%xmm15 not saved
1273* all %xmm registers are clobbered
1274*/
1275	sub	$VARIABLE_OFFSET, %rsp
1276	and	$~63, %rsp                        # align rsp to 64 bytes
1277	mov	%arg6, %r12
1278	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
1279        movdqa  SHUF_MASK(%rip), %xmm2
1280	PSHUFB_XMM %xmm2, %xmm13
1281
1282
1283# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1284
1285	movdqa	%xmm13, %xmm2
1286	psllq	$1, %xmm13
1287	psrlq	$63, %xmm2
1288	movdqa	%xmm2, %xmm1
1289	pslldq	$8, %xmm2
1290	psrldq	$8, %xmm1
1291	por	%xmm2, %xmm13
1292
1293        # Reduction
1294
1295	pshufd	$0x24, %xmm1, %xmm2
1296	pcmpeqd TWOONE(%rip), %xmm2
1297	pand	POLY(%rip), %xmm2
1298	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
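        # Conceptually (illustrative C on a hypothetical 128-bit integer
        # type, for reference only): the bit shifted out of the top selects
        # a conditional XOR with the reflected polynomial POLY defined above:
        #	carry = h >> 127;
        #	h <<= 1;
        #	if (carry)
        #		h ^= 0xC2000000000000000000000000000001;	// POLY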
1299
1300
1301        # Decrypt first few blocks
1302
1303	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
1304	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
1305	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
1306	mov %r13, %r12
1307	and $(3<<4), %r12
1308	jz _initial_num_blocks_is_0_decrypt
1309	cmp $(2<<4), %r12
1310	jb _initial_num_blocks_is_1_decrypt
1311	je _initial_num_blocks_is_2_decrypt
1312_initial_num_blocks_is_3_decrypt:
1313	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1314%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1315	sub	$48, %r13
1316	jmp	_initial_blocks_decrypted
1317_initial_num_blocks_is_2_decrypt:
1318	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1320	sub	$32, %r13
1321	jmp	_initial_blocks_decrypted
1322_initial_num_blocks_is_1_decrypt:
1323	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1325	sub	$16, %r13
1326	jmp	_initial_blocks_decrypted
1327_initial_num_blocks_is_0_decrypt:
1328	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1330_initial_blocks_decrypted:
1331	cmp	$0, %r13
1332	je	_zero_cipher_left_decrypt
1333	sub	$64, %r13
1334	je	_four_cipher_left_decrypt
1335_decrypt_by_4:
1336	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1337%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1338	add	$64, %r11
1339	sub	$64, %r13
1340	jne	_decrypt_by_4
1341_four_cipher_left_decrypt:
1342	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1343%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1344_zero_cipher_left_decrypt:
1345	mov	%arg4, %r13
1346	and	$15, %r13				# %r13 = arg4 (mod 16)
1347	je	_multiple_of_16_bytes_decrypt
1348
1349        # Handle the last <16 byte block separately
1350
1351	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
1352        movdqa SHUF_MASK(%rip), %xmm10
1353	PSHUFB_XMM %xmm10, %xmm0
1354
1355	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
1356	sub $16, %r11
1357	add %r13, %r11
1358	movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
1359	lea SHIFT_MASK+16(%rip), %r12
1360	sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1362# (%r13 is the number of bytes in plaintext mod 16)
1363	movdqu (%r12), %xmm2           # get the appropriate shuffle mask
1364	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes
1365
1366	movdqa  %xmm1, %xmm2
1367	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
1368	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1369	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1370	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
1371	pand    %xmm1, %xmm2
1372        movdqa SHUF_MASK(%rip), %xmm10
1373	PSHUFB_XMM %xmm10 ,%xmm2
1374
1375	pxor %xmm2, %xmm8
1376	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1377	          # GHASH computation for the last <16 byte block
1378	sub %r13, %r11
1379	add $16, %r11
1380
1381        # output %r13 bytes
1382	MOVQ_R64_XMM	%xmm0, %rax
1383	cmp	$8, %r13
1384	jle	_less_than_8_bytes_left_decrypt
1385	mov	%rax, (%arg2 , %r11, 1)
1386	add	$8, %r11
1387	psrldq	$8, %xmm0
1388	MOVQ_R64_XMM	%xmm0, %rax
1389	sub	$8, %r13
1390_less_than_8_bytes_left_decrypt:
1391	mov	%al,  (%arg2, %r11, 1)
1392	add	$1, %r11
1393	shr	$8, %rax
1394	sub	$1, %r13
1395	jne	_less_than_8_bytes_left_decrypt
1396_multiple_of_16_bytes_decrypt:
1397	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1398	shl	$3, %r12		  # convert into number of bits
1399	movd	%r12d, %xmm15		  # len(A) in %xmm15
1400	shl	$3, %arg4		  # len(C) in bits (*8)
1401	MOVQ_R64_XMM	%arg4, %xmm1
1402	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
1403	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
1404	pxor	%xmm15, %xmm8
1405	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1406	         # final GHASH computation
1407        movdqa SHUF_MASK(%rip), %xmm10
1408	PSHUFB_XMM %xmm10, %xmm8
1409
1410	mov	%arg5, %rax		  # %rax = *Y0
1411	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
1412	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
1413	pxor	%xmm8, %xmm0
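	# At this point %xmm0 holds the full 16-byte tag, following the usual
	# GCM construction (illustrative pseudo-C, for reference only):
	#	S = GHASH(H, padded AAD || padded C || len(A)*8 || len(C)*8)
	#	T = AES_K(Y0) ^ S	# truncated below to auth_tag_len bytes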
1414_return_T_decrypt:
1415	mov	arg9, %r10                # %r10 = authTag
1416	mov	arg10, %r11               # %r11 = auth_tag_len
1417	cmp	$16, %r11
1418	je	_T_16_decrypt
1419	cmp	$12, %r11
1420	je	_T_12_decrypt
1421_T_8_decrypt:
1422	MOVQ_R64_XMM	%xmm0, %rax
1423	mov	%rax, (%r10)
1424	jmp	_return_T_done_decrypt
1425_T_12_decrypt:
1426	MOVQ_R64_XMM	%xmm0, %rax
1427	mov	%rax, (%r10)
1428	psrldq	$8, %xmm0
1429	movd	%xmm0, %eax
1430	mov	%eax, 8(%r10)
1431	jmp	_return_T_done_decrypt
1432_T_16_decrypt:
1433	movdqu	%xmm0, (%r10)
1434_return_T_done_decrypt:
1435	mov	%r14, %rsp
1436	pop	%r14
1437	pop	%r13
1438	pop	%r12
1439	ret
1440
1441
1442/*****************************************************************************
1443* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1444*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1445*                    const u8 *in,       // Plaintext input
1446*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1447*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1448*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1450*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451*                    const u8 *aad,      // Additional Authentication Data (AAD)
1452*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453*                    u8 *auth_tag,       // Authenticated Tag output.
1454*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1455*                                        // 12 or 8.
1456*
1457* Assumptions:
1458*
1459* keys:
1460*       keys are pre-expanded and aligned to 16 bytes. we are using the
1461*       first set of 11 keys in the data structure void *aes_ctx
1462*
1463*
1464* iv:
1465*       0                   1                   2                   3
1466*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468*       |                             Salt  (From the SA)               |
1469*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470*       |                     Initialization Vector                     |
1471*       |         (This is the sequence number from IPSec header)       |
1472*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1473*       |                              0x1                              |
1474*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1475*
1476*
1477*
1478* AAD:
1479*       AAD padded to 128 bits with 0
1480*       for example, assume AAD is a u32 vector
1481*
1482*       if AAD is 8 bytes:
1483*       AAD[3] = {A0, A1};
1484*       padded AAD in xmm register = {A1 A0 0 0}
1485*
1486*       0                   1                   2                   3
1487*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1489*       |                               SPI (A1)                        |
1490*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491*       |                     32-bit Sequence Number (A0)               |
1492*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493*       |                              0x0                              |
1494*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*
1496*                                 AAD Format with 32-bit Sequence Number
1497*
1498*       if AAD is 12 bytes:
1499*       AAD[3] = {A0, A1, A2};
1500*       padded AAD in xmm register = {A2 A1 A0 0}
1501*
1502*       0                   1                   2                   3
1503*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1505*       |                               SPI (A2)                        |
1506*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507*       |                 64-bit Extended Sequence Number {A1,A0}       |
1508*       |                                                               |
1509*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510*       |                              0x0                              |
1511*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513*                         AAD Format with 64-bit Extended Sequence Number
1514*
1515* aadLen:
1516*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1517*       The code also supports an aadLen of 16 bytes; any other size will fail.
1518*
1519* TLen:
1520*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1521*       For other sizes, the code will fail.
1522*
1523* poly = x^128 + x^127 + x^126 + x^121 + 1
1524***************************************************************************/
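/*
 * For illustration only (a sketch, not used by this file): building the
 * 16-byte counter block described above from the 4-byte salt and the 8-byte
 * ESP IV, using hypothetical 'salt' and 'esp_iv' buffers:
 *
 *	u8 ctr_blk[16];
 *	memcpy(ctr_blk, salt, 4);                    // Salt (from the SA)
 *	memcpy(ctr_blk + 4, esp_iv, 8);              // IV (ESP sequence number)
 *	*(__be32 *)(ctr_blk + 12) = cpu_to_be32(1);  // trailing 0x00000001
 */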
1525ENTRY(aesni_gcm_enc)
1526	push	%r12
1527	push	%r13
1528	push	%r14
1529	mov	%rsp, %r14
1530#
1531# states of %xmm registers %xmm6:%xmm15 not saved
1532# all %xmm registers are clobbered
1533#
1534	sub	$VARIABLE_OFFSET, %rsp
1535	and	$~63, %rsp
1536	mov	%arg6, %r12
1537	movdqu	(%r12), %xmm13
1538        movdqa  SHUF_MASK(%rip), %xmm2
1539	PSHUFB_XMM %xmm2, %xmm13
1540
1541
1542# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1543
1544	movdqa	%xmm13, %xmm2
1545	psllq	$1, %xmm13
1546	psrlq	$63, %xmm2
1547	movdqa	%xmm2, %xmm1
1548	pslldq	$8, %xmm2
1549	psrldq	$8, %xmm1
1550	por	%xmm2, %xmm13
1551
1552        # reduce HashKey<<1
1553
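	# If the 128-bit shift above overflowed (the MSB of the byte-swapped
	# HashKey was set), reduce by XORing in the field polynomial.  The
	# pshufd/pcmpeqd/pand sequence turns the carry bit held in %xmm1 into
	# an all-ones or all-zero mask for POLY, so the reduction is branchless.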
1554	pshufd	$0x24, %xmm1, %xmm2
1555	pcmpeqd TWOONE(%rip), %xmm2
1556	pand	POLY(%rip), %xmm2
1557	pxor	%xmm2, %xmm13
1558	movdqa	%xmm13, HashKey(%rsp)
1559	mov	%arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
1560	and	$-16, %r13
1561	mov	%r13, %r12
1562
1563        # Encrypt first few blocks
1564
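	# %r13 = number of complete-block bytes; the 'and' below leaves
	# %r12 = (number of blocks mod 4) * 16, i.e. how many (0-3) blocks to
	# encrypt here so that the main loop can always process four blocks
	# at a time.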
1565	and	$(3<<4), %r12
1566	jz	_initial_num_blocks_is_0_encrypt
1567	cmp	$(2<<4), %r12
1568	jb	_initial_num_blocks_is_1_encrypt
1569	je	_initial_num_blocks_is_2_encrypt
1570_initial_num_blocks_is_3_encrypt:
1571	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1572%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1573	sub	$48, %r13
1574	jmp	_initial_blocks_encrypted
1575_initial_num_blocks_is_2_encrypt:
1576	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1577%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1578	sub	$32, %r13
1579	jmp	_initial_blocks_encrypted
1580_initial_num_blocks_is_1_encrypt:
1581	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1582%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1583	sub	$16, %r13
1584	jmp	_initial_blocks_encrypted
1585_initial_num_blocks_is_0_encrypt:
1586	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1587%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1588_initial_blocks_encrypted:
1589
1590        # Main loop - Encrypt remaining blocks
1591
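	# GHASH_4_ENCRYPT_4_PARALLEL_ENC encrypts four blocks while GHASHing
	# the four ciphertext blocks produced by the previous iteration;
	# GHASH_LAST_4 below folds in the final four ciphertext blocks.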
1592	cmp	$0, %r13
1593	je	_zero_cipher_left_encrypt
1594	sub	$64, %r13
1595	je	_four_cipher_left_encrypt
1596_encrypt_by_4_encrypt:
1597	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1598%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1599	add	$64, %r11
1600	sub	$64, %r13
1601	jne	_encrypt_by_4_encrypt
1602_four_cipher_left_encrypt:
1603	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1604%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1605_zero_cipher_left_encrypt:
1606	mov	%arg4, %r13
1607	and	$15, %r13			# %r13 = arg4 (mod 16)
1608	je	_multiple_of_16_bytes_encrypt
1609
1610         # Handle the last <16 Byte block separately
1611	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
1612        movdqa SHUF_MASK(%rip), %xmm10
1613	PSHUFB_XMM %xmm10, %xmm0
1614
1615
1616	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
1617	sub $16, %r11
1618	add %r13, %r11
1619	movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte block
1620	lea SHIFT_MASK+16(%rip), %r12
1621	sub %r13, %r12
1622	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1623	# (%r13 is the number of bytes in plaintext mod 16)
1624	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
1625	PSHUFB_XMM	%xmm2, %xmm1            # shift right 16-r13 bytes
1626	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
1627	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
1628	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
1629	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
1630        movdqa SHUF_MASK(%rip), %xmm10
1631	PSHUFB_XMM %xmm10,%xmm0
1632
1633	pxor	%xmm0, %xmm8
1634	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1635	# GHASH computation for the last <16 byte block
1636	sub	%r13, %r11
1637	add	$16, %r11
1638
1639	movdqa SHUF_MASK(%rip), %xmm10
1640	PSHUFB_XMM %xmm10, %xmm0
1641
1642	# shuffle xmm0 back to output as ciphertext
1643
1644        # Output %r13 bytes
1645	MOVQ_R64_XMM %xmm0, %rax
1646	cmp $8, %r13
1647	jle _less_than_8_bytes_left_encrypt
1648	mov %rax, (%arg2 , %r11, 1)
1649	add $8, %r11
1650	psrldq $8, %xmm0
1651	MOVQ_R64_XMM %xmm0, %rax
1652	sub $8, %r13
1653_less_than_8_bytes_left_encrypt:
1654	mov %al,  (%arg2, %r11, 1)
1655	add $1, %r11
1656	shr $8, %rax
1657	sub $1, %r13
1658	jne _less_than_8_bytes_left_encrypt
1659_multiple_of_16_bytes_encrypt:
1660	mov	arg8, %r12    # %r12 = aadLen (number of bytes)
1661	shl	$3, %r12
1662	movd	%r12d, %xmm15       # len(A) in %xmm15
1663	shl	$3, %arg4               # len(C) in bits (*8)
1664	MOVQ_R64_XMM	%arg4, %xmm1
1665	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
1666	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
1667	pxor	%xmm15, %xmm8
1668	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1669	# final GHASH computation
1670        movdqa SHUF_MASK(%rip), %xmm10
1671	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
1672
1673	mov	%arg5, %rax		       # %rax = pointer to Y0 (the IV)
1674	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
1675	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
1676	pxor	%xmm8, %xmm0
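	# %xmm0 = E(K, Y0) XOR GHASH(A, C, len(A)||len(C)) = authentication tag T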
1677_return_T_encrypt:
1678	mov	arg9, %r10                     # %r10 = authTag
1679	mov	arg10, %r11                    # %r11 = auth_tag_len
1680	cmp	$16, %r11
1681	je	_T_16_encrypt
1682	cmp	$12, %r11
1683	je	_T_12_encrypt
1684_T_8_encrypt:
1685	MOVQ_R64_XMM	%xmm0, %rax
1686	mov	%rax, (%r10)
1687	jmp	_return_T_done_encrypt
1688_T_12_encrypt:
1689	MOVQ_R64_XMM	%xmm0, %rax
1690	mov	%rax, (%r10)
1691	psrldq	$8, %xmm0
1692	movd	%xmm0, %eax
1693	mov	%eax, 8(%r10)
1694	jmp	_return_T_done_encrypt
1695_T_16_encrypt:
1696	movdqu	%xmm0, (%r10)
1697_return_T_done_encrypt:
1698	mov	%r14, %rsp
1699	pop	%r14
1700	pop	%r13
1701	pop	%r12
1702	ret
1703
1704#endif
1705
1706
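/*
 * _key_expansion_128/192a/192b/256a/256b:	internal ABI
 *	Derive the next AES round key(s) from the previous round-key material
 *	in %xmm0/%xmm2 and the AESKEYGENASSIST result in %xmm1, store them at
 *	(TKEYP) and advance TKEYP.
 */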
1707_key_expansion_128:
1708_key_expansion_256a:
1709	pshufd $0b11111111, %xmm1, %xmm1
1710	shufps $0b00010000, %xmm0, %xmm4
1711	pxor %xmm4, %xmm0
1712	shufps $0b10001100, %xmm0, %xmm4
1713	pxor %xmm4, %xmm0
1714	pxor %xmm1, %xmm0
1715	movaps %xmm0, (TKEYP)
1716	add $0x10, TKEYP
1717	ret
1718
1719.align 4
1720_key_expansion_192a:
1721	pshufd $0b01010101, %xmm1, %xmm1
1722	shufps $0b00010000, %xmm0, %xmm4
1723	pxor %xmm4, %xmm0
1724	shufps $0b10001100, %xmm0, %xmm4
1725	pxor %xmm4, %xmm0
1726	pxor %xmm1, %xmm0
1727
1728	movaps %xmm2, %xmm5
1729	movaps %xmm2, %xmm6
1730	pslldq $4, %xmm5
1731	pshufd $0b11111111, %xmm0, %xmm3
1732	pxor %xmm3, %xmm2
1733	pxor %xmm5, %xmm2
1734
1735	movaps %xmm0, %xmm1
1736	shufps $0b01000100, %xmm0, %xmm6
1737	movaps %xmm6, (TKEYP)
1738	shufps $0b01001110, %xmm2, %xmm1
1739	movaps %xmm1, 0x10(TKEYP)
1740	add $0x20, TKEYP
1741	ret
1742
1743.align 4
1744_key_expansion_192b:
1745	pshufd $0b01010101, %xmm1, %xmm1
1746	shufps $0b00010000, %xmm0, %xmm4
1747	pxor %xmm4, %xmm0
1748	shufps $0b10001100, %xmm0, %xmm4
1749	pxor %xmm4, %xmm0
1750	pxor %xmm1, %xmm0
1751
1752	movaps %xmm2, %xmm5
1753	pslldq $4, %xmm5
1754	pshufd $0b11111111, %xmm0, %xmm3
1755	pxor %xmm3, %xmm2
1756	pxor %xmm5, %xmm2
1757
1758	movaps %xmm0, (TKEYP)
1759	add $0x10, TKEYP
1760	ret
1761
1762.align 4
1763_key_expansion_256b:
1764	pshufd $0b10101010, %xmm1, %xmm1
1765	shufps $0b00010000, %xmm2, %xmm4
1766	pxor %xmm4, %xmm2
1767	shufps $0b10001100, %xmm2, %xmm4
1768	pxor %xmm4, %xmm2
1769	pxor %xmm1, %xmm2
1770	movaps %xmm2, (TKEYP)
1771	add $0x10, TKEYP
1772	ret
1773
1774/*
1775 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1776 *                   unsigned int key_len)
1777 */
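/*
 * Illustrative call sketch only, assuming FPU access and a 16-byte aligned
 * context as arranged by the C glue code (aesni-intel_glue.c); 'user_key'
 * is a hypothetical key buffer:
 *
 *	struct crypto_aes_ctx ctx __aligned(16);
 *	kernel_fpu_begin();
 *	aesni_set_key(&ctx, user_key, AES_KEYSIZE_128);
 *	kernel_fpu_end();
 */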
1778ENTRY(aesni_set_key)
1779#ifndef __x86_64__
1780	pushl KEYP
1781	movl 8(%esp), KEYP		# ctx
1782	movl 12(%esp), UKEYP		# in_key
1783	movl 16(%esp), %edx		# key_len
1784#endif
1785	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1786	movaps %xmm0, (KEYP)
1787	lea 0x10(KEYP), TKEYP		# key addr
1788	movl %edx, 480(KEYP)
1789	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1790	cmp $24, %dl
1791	jb .Lenc_key128
1792	je .Lenc_key192
1793	movups 0x10(UKEYP), %xmm2	# other user key
1794	movaps %xmm2, (TKEYP)
1795	add $0x10, TKEYP
1796	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1797	call _key_expansion_256a
1798	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1799	call _key_expansion_256b
1800	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1801	call _key_expansion_256a
1802	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1803	call _key_expansion_256b
1804	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1805	call _key_expansion_256a
1806	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1807	call _key_expansion_256b
1808	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1809	call _key_expansion_256a
1810	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1811	call _key_expansion_256b
1812	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1813	call _key_expansion_256a
1814	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1815	call _key_expansion_256b
1816	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1817	call _key_expansion_256a
1818	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1819	call _key_expansion_256b
1820	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1821	call _key_expansion_256a
1822	jmp .Ldec_key
1823.Lenc_key192:
1824	movq 0x10(UKEYP), %xmm2		# other user key
1825	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1826	call _key_expansion_192a
1827	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1828	call _key_expansion_192b
1829	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1830	call _key_expansion_192a
1831	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1832	call _key_expansion_192b
1833	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1834	call _key_expansion_192a
1835	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1836	call _key_expansion_192b
1837	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1838	call _key_expansion_192a
1839	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
1840	call _key_expansion_192b
1841	jmp .Ldec_key
1842.Lenc_key128:
1843	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
1844	call _key_expansion_128
1845	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
1846	call _key_expansion_128
1847	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
1848	call _key_expansion_128
1849	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
1850	call _key_expansion_128
1851	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
1852	call _key_expansion_128
1853	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
1854	call _key_expansion_128
1855	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
1856	call _key_expansion_128
1857	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
1858	call _key_expansion_128
1859	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
1860	call _key_expansion_128
1861	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
1862	call _key_expansion_128
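	# Build the decryption key schedule in the second half of the context
	# (offset 240): the encryption round keys in reverse order, with
	# AESIMC (InvMixColumns) applied to all but the first and last.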
1863.Ldec_key:
1864	sub $0x10, TKEYP
1865	movaps (KEYP), %xmm0
1866	movaps (TKEYP), %xmm1
1867	movaps %xmm0, 240(TKEYP)
1868	movaps %xmm1, 240(KEYP)
1869	add $0x10, KEYP
1870	lea 240-16(TKEYP), UKEYP
1871.align 4
1872.Ldec_key_loop:
1873	movaps (KEYP), %xmm0
1874	AESIMC %xmm0 %xmm1
1875	movaps %xmm1, (UKEYP)
1876	add $0x10, KEYP
1877	sub $0x10, UKEYP
1878	cmp TKEYP, KEYP
1879	jb .Ldec_key_loop
1880	xor AREG, AREG
1881#ifndef __x86_64__
1882	popl KEYP
1883#endif
1884	ret
1885
1886/*
1887 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1888 */
1889ENTRY(aesni_enc)
1890#ifndef __x86_64__
1891	pushl KEYP
1892	pushl KLEN
1893	movl 12(%esp), KEYP
1894	movl 16(%esp), OUTP
1895	movl 20(%esp), INP
1896#endif
1897	movl 480(KEYP), KLEN		# key length
1898	movups (INP), STATE		# input
1899	call _aesni_enc1
1900	movups STATE, (OUTP)		# output
1901#ifndef __x86_64__
1902	popl KLEN
1903	popl KEYP
1904#endif
1905	ret
1906
1907/*
1908 * _aesni_enc1:		internal ABI
1909 * input:
1910 *	KEYP:		key struct pointer
1911 *	KLEN:		key length
1912 *	STATE:		initial state (input)
1913 * output:
1914 *	STATE:		final state (output)
1915 * changed:
1916 *	KEY
1917 *	TKEYP (T1)
1918 */
1919.align 4
1920_aesni_enc1:
1921	movaps (KEYP), KEY		# key
1922	mov KEYP, TKEYP
1923	pxor KEY, STATE		# round 0
1924	add $0x30, TKEYP
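	# KLEN holds the key length in bytes (16/24/32).  TKEYP is biased by
	# the key size below so that the shared tail at .Lenc128 addresses the
	# correct final round keys for 10, 12 or 14 rounds.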
1925	cmp $24, KLEN
1926	jb .Lenc128
1927	lea 0x20(TKEYP), TKEYP
1928	je .Lenc192
1929	add $0x20, TKEYP
1930	movaps -0x60(TKEYP), KEY
1931	AESENC KEY STATE
1932	movaps -0x50(TKEYP), KEY
1933	AESENC KEY STATE
1934.align 4
1935.Lenc192:
1936	movaps -0x40(TKEYP), KEY
1937	AESENC KEY STATE
1938	movaps -0x30(TKEYP), KEY
1939	AESENC KEY STATE
1940.align 4
1941.Lenc128:
1942	movaps -0x20(TKEYP), KEY
1943	AESENC KEY STATE
1944	movaps -0x10(TKEYP), KEY
1945	AESENC KEY STATE
1946	movaps (TKEYP), KEY
1947	AESENC KEY STATE
1948	movaps 0x10(TKEYP), KEY
1949	AESENC KEY STATE
1950	movaps 0x20(TKEYP), KEY
1951	AESENC KEY STATE
1952	movaps 0x30(TKEYP), KEY
1953	AESENC KEY STATE
1954	movaps 0x40(TKEYP), KEY
1955	AESENC KEY STATE
1956	movaps 0x50(TKEYP), KEY
1957	AESENC KEY STATE
1958	movaps 0x60(TKEYP), KEY
1959	AESENC KEY STATE
1960	movaps 0x70(TKEYP), KEY
1961	AESENCLAST KEY STATE
1962	ret
1963
1964/*
1965 * _aesni_enc4:	internal ABI
1966 * input:
1967 *	KEYP:		key struct pointer
1968 *	KLEN:		key length
1969 *	STATE1:		initial state (input)
1970 *	STATE2
1971 *	STATE3
1972 *	STATE4
1973 * output:
1974 *	STATE1:		final state (output)
1975 *	STATE2
1976 *	STATE3
1977 *	STATE4
1978 * changed:
1979 *	KEY
1980 *	TKEYP (T1)
1981 */
1982.align 4
1983_aesni_enc4:
1984	movaps (KEYP), KEY		# key
1985	mov KEYP, TKEYP
1986	pxor KEY, STATE1		# round 0
1987	pxor KEY, STATE2
1988	pxor KEY, STATE3
1989	pxor KEY, STATE4
1990	add $0x30, TKEYP
1991	cmp $24, KLEN
1992	jb .L4enc128
1993	lea 0x20(TKEYP), TKEYP
1994	je .L4enc192
1995	add $0x20, TKEYP
1996	movaps -0x60(TKEYP), KEY
1997	AESENC KEY STATE1
1998	AESENC KEY STATE2
1999	AESENC KEY STATE3
2000	AESENC KEY STATE4
2001	movaps -0x50(TKEYP), KEY
2002	AESENC KEY STATE1
2003	AESENC KEY STATE2
2004	AESENC KEY STATE3
2005	AESENC KEY STATE4
2006#.align 4
2007.L4enc192:
2008	movaps -0x40(TKEYP), KEY
2009	AESENC KEY STATE1
2010	AESENC KEY STATE2
2011	AESENC KEY STATE3
2012	AESENC KEY STATE4
2013	movaps -0x30(TKEYP), KEY
2014	AESENC KEY STATE1
2015	AESENC KEY STATE2
2016	AESENC KEY STATE3
2017	AESENC KEY STATE4
2018#.align 4
2019.L4enc128:
2020	movaps -0x20(TKEYP), KEY
2021	AESENC KEY STATE1
2022	AESENC KEY STATE2
2023	AESENC KEY STATE3
2024	AESENC KEY STATE4
2025	movaps -0x10(TKEYP), KEY
2026	AESENC KEY STATE1
2027	AESENC KEY STATE2
2028	AESENC KEY STATE3
2029	AESENC KEY STATE4
2030	movaps (TKEYP), KEY
2031	AESENC KEY STATE1
2032	AESENC KEY STATE2
2033	AESENC KEY STATE3
2034	AESENC KEY STATE4
2035	movaps 0x10(TKEYP), KEY
2036	AESENC KEY STATE1
2037	AESENC KEY STATE2
2038	AESENC KEY STATE3
2039	AESENC KEY STATE4
2040	movaps 0x20(TKEYP), KEY
2041	AESENC KEY STATE1
2042	AESENC KEY STATE2
2043	AESENC KEY STATE3
2044	AESENC KEY STATE4
2045	movaps 0x30(TKEYP), KEY
2046	AESENC KEY STATE1
2047	AESENC KEY STATE2
2048	AESENC KEY STATE3
2049	AESENC KEY STATE4
2050	movaps 0x40(TKEYP), KEY
2051	AESENC KEY STATE1
2052	AESENC KEY STATE2
2053	AESENC KEY STATE3
2054	AESENC KEY STATE4
2055	movaps 0x50(TKEYP), KEY
2056	AESENC KEY STATE1
2057	AESENC KEY STATE2
2058	AESENC KEY STATE3
2059	AESENC KEY STATE4
2060	movaps 0x60(TKEYP), KEY
2061	AESENC KEY STATE1
2062	AESENC KEY STATE2
2063	AESENC KEY STATE3
2064	AESENC KEY STATE4
2065	movaps 0x70(TKEYP), KEY
2066	AESENCLAST KEY STATE1		# last round
2067	AESENCLAST KEY STATE2
2068	AESENCLAST KEY STATE3
2069	AESENCLAST KEY STATE4
2070	ret
2071
2072/*
2073 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2074 */
2075ENTRY(aesni_dec)
2076#ifndef __x86_64__
2077	pushl KEYP
2078	pushl KLEN
2079	movl 12(%esp), KEYP
2080	movl 16(%esp), OUTP
2081	movl 20(%esp), INP
2082#endif
2083	mov 480(KEYP), KLEN		# key length
2084	add $240, KEYP
2085	movups (INP), STATE		# input
2086	call _aesni_dec1
2087	movups STATE, (OUTP)		# output
2088#ifndef __x86_64__
2089	popl KLEN
2090	popl KEYP
2091#endif
2092	ret
2093
2094/*
2095 * _aesni_dec1:		internal ABI
2096 * input:
2097 *	KEYP:		key struct pointer
2098 *	KLEN:		key length
2099 *	STATE:		initial state (input)
2100 * output:
2101 *	STATE:		final state (output)
2102 * changed:
2103 *	KEY
2104 *	TKEYP (T1)
2105 */
2106.align 4
2107_aesni_dec1:
2108	movaps (KEYP), KEY		# key
2109	mov KEYP, TKEYP
2110	pxor KEY, STATE		# round 0
2111	add $0x30, TKEYP
2112	cmp $24, KLEN
2113	jb .Ldec128
2114	lea 0x20(TKEYP), TKEYP
2115	je .Ldec192
2116	add $0x20, TKEYP
2117	movaps -0x60(TKEYP), KEY
2118	AESDEC KEY STATE
2119	movaps -0x50(TKEYP), KEY
2120	AESDEC KEY STATE
2121.align 4
2122.Ldec192:
2123	movaps -0x40(TKEYP), KEY
2124	AESDEC KEY STATE
2125	movaps -0x30(TKEYP), KEY
2126	AESDEC KEY STATE
2127.align 4
2128.Ldec128:
2129	movaps -0x20(TKEYP), KEY
2130	AESDEC KEY STATE
2131	movaps -0x10(TKEYP), KEY
2132	AESDEC KEY STATE
2133	movaps (TKEYP), KEY
2134	AESDEC KEY STATE
2135	movaps 0x10(TKEYP), KEY
2136	AESDEC KEY STATE
2137	movaps 0x20(TKEYP), KEY
2138	AESDEC KEY STATE
2139	movaps 0x30(TKEYP), KEY
2140	AESDEC KEY STATE
2141	movaps 0x40(TKEYP), KEY
2142	AESDEC KEY STATE
2143	movaps 0x50(TKEYP), KEY
2144	AESDEC KEY STATE
2145	movaps 0x60(TKEYP), KEY
2146	AESDEC KEY STATE
2147	movaps 0x70(TKEYP), KEY
2148	AESDECLAST KEY STATE
2149	ret
2150
2151/*
2152 * _aesni_dec4:	internal ABI
2153 * input:
2154 *	KEYP:		key struct pointer
2155 *	KLEN:		key length
2156 *	STATE1:		initial state (input)
2157 *	STATE2
2158 *	STATE3
2159 *	STATE4
2160 * output:
2161 *	STATE1:		final state (output)
2162 *	STATE2
2163 *	STATE3
2164 *	STATE4
2165 * changed:
2166 *	KEY
2167 *	TKEYP (T1)
2168 */
2169.align 4
2170_aesni_dec4:
2171	movaps (KEYP), KEY		# key
2172	mov KEYP, TKEYP
2173	pxor KEY, STATE1		# round 0
2174	pxor KEY, STATE2
2175	pxor KEY, STATE3
2176	pxor KEY, STATE4
2177	add $0x30, TKEYP
2178	cmp $24, KLEN
2179	jb .L4dec128
2180	lea 0x20(TKEYP), TKEYP
2181	je .L4dec192
2182	add $0x20, TKEYP
2183	movaps -0x60(TKEYP), KEY
2184	AESDEC KEY STATE1
2185	AESDEC KEY STATE2
2186	AESDEC KEY STATE3
2187	AESDEC KEY STATE4
2188	movaps -0x50(TKEYP), KEY
2189	AESDEC KEY STATE1
2190	AESDEC KEY STATE2
2191	AESDEC KEY STATE3
2192	AESDEC KEY STATE4
2193.align 4
2194.L4dec192:
2195	movaps -0x40(TKEYP), KEY
2196	AESDEC KEY STATE1
2197	AESDEC KEY STATE2
2198	AESDEC KEY STATE3
2199	AESDEC KEY STATE4
2200	movaps -0x30(TKEYP), KEY
2201	AESDEC KEY STATE1
2202	AESDEC KEY STATE2
2203	AESDEC KEY STATE3
2204	AESDEC KEY STATE4
2205.align 4
2206.L4dec128:
2207	movaps -0x20(TKEYP), KEY
2208	AESDEC KEY STATE1
2209	AESDEC KEY STATE2
2210	AESDEC KEY STATE3
2211	AESDEC KEY STATE4
2212	movaps -0x10(TKEYP), KEY
2213	AESDEC KEY STATE1
2214	AESDEC KEY STATE2
2215	AESDEC KEY STATE3
2216	AESDEC KEY STATE4
2217	movaps (TKEYP), KEY
2218	AESDEC KEY STATE1
2219	AESDEC KEY STATE2
2220	AESDEC KEY STATE3
2221	AESDEC KEY STATE4
2222	movaps 0x10(TKEYP), KEY
2223	AESDEC KEY STATE1
2224	AESDEC KEY STATE2
2225	AESDEC KEY STATE3
2226	AESDEC KEY STATE4
2227	movaps 0x20(TKEYP), KEY
2228	AESDEC KEY STATE1
2229	AESDEC KEY STATE2
2230	AESDEC KEY STATE3
2231	AESDEC KEY STATE4
2232	movaps 0x30(TKEYP), KEY
2233	AESDEC KEY STATE1
2234	AESDEC KEY STATE2
2235	AESDEC KEY STATE3
2236	AESDEC KEY STATE4
2237	movaps 0x40(TKEYP), KEY
2238	AESDEC KEY STATE1
2239	AESDEC KEY STATE2
2240	AESDEC KEY STATE3
2241	AESDEC KEY STATE4
2242	movaps 0x50(TKEYP), KEY
2243	AESDEC KEY STATE1
2244	AESDEC KEY STATE2
2245	AESDEC KEY STATE3
2246	AESDEC KEY STATE4
2247	movaps 0x60(TKEYP), KEY
2248	AESDEC KEY STATE1
2249	AESDEC KEY STATE2
2250	AESDEC KEY STATE3
2251	AESDEC KEY STATE4
2252	movaps 0x70(TKEYP), KEY
2253	AESDECLAST KEY STATE1		# last round
2254	AESDECLAST KEY STATE2
2255	AESDECLAST KEY STATE3
2256	AESDECLAST KEY STATE4
2257	ret
2258
2259/*
2260 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2261 *		      size_t len)
2262 */
2263ENTRY(aesni_ecb_enc)
2264#ifndef __x86_64__
2265	pushl LEN
2266	pushl KEYP
2267	pushl KLEN
2268	movl 16(%esp), KEYP
2269	movl 20(%esp), OUTP
2270	movl 24(%esp), INP
2271	movl 28(%esp), LEN
2272#endif
2273	test LEN, LEN		# check length
2274	jz .Lecb_enc_ret
2275	mov 480(KEYP), KLEN
2276	cmp $16, LEN
2277	jb .Lecb_enc_ret
2278	cmp $64, LEN
2279	jb .Lecb_enc_loop1
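	# 64 or more bytes remain: encrypt four blocks per iteration, then
	# finish one block at a time; a trailing partial block (<16 bytes) is
	# not processed.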
2280.align 4
2281.Lecb_enc_loop4:
2282	movups (INP), STATE1
2283	movups 0x10(INP), STATE2
2284	movups 0x20(INP), STATE3
2285	movups 0x30(INP), STATE4
2286	call _aesni_enc4
2287	movups STATE1, (OUTP)
2288	movups STATE2, 0x10(OUTP)
2289	movups STATE3, 0x20(OUTP)
2290	movups STATE4, 0x30(OUTP)
2291	sub $64, LEN
2292	add $64, INP
2293	add $64, OUTP
2294	cmp $64, LEN
2295	jge .Lecb_enc_loop4
2296	cmp $16, LEN
2297	jb .Lecb_enc_ret
2298.align 4
2299.Lecb_enc_loop1:
2300	movups (INP), STATE1
2301	call _aesni_enc1
2302	movups STATE1, (OUTP)
2303	sub $16, LEN
2304	add $16, INP
2305	add $16, OUTP
2306	cmp $16, LEN
2307	jge .Lecb_enc_loop1
2308.Lecb_enc_ret:
2309#ifndef __x86_64__
2310	popl KLEN
2311	popl KEYP
2312	popl LEN
2313#endif
2314	ret
2315
2316/*
2317 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2318 *		      size_t len);
2319 */
2320ENTRY(aesni_ecb_dec)
2321#ifndef __x86_64__
2322	pushl LEN
2323	pushl KEYP
2324	pushl KLEN
2325	movl 16(%esp), KEYP
2326	movl 20(%esp), OUTP
2327	movl 24(%esp), INP
2328	movl 28(%esp), LEN
2329#endif
2330	test LEN, LEN
2331	jz .Lecb_dec_ret
2332	mov 480(KEYP), KLEN
2333	add $240, KEYP
2334	cmp $16, LEN
2335	jb .Lecb_dec_ret
2336	cmp $64, LEN
2337	jb .Lecb_dec_loop1
2338.align 4
2339.Lecb_dec_loop4:
2340	movups (INP), STATE1
2341	movups 0x10(INP), STATE2
2342	movups 0x20(INP), STATE3
2343	movups 0x30(INP), STATE4
2344	call _aesni_dec4
2345	movups STATE1, (OUTP)
2346	movups STATE2, 0x10(OUTP)
2347	movups STATE3, 0x20(OUTP)
2348	movups STATE4, 0x30(OUTP)
2349	sub $64, LEN
2350	add $64, INP
2351	add $64, OUTP
2352	cmp $64, LEN
2353	jge .Lecb_dec_loop4
2354	cmp $16, LEN
2355	jb .Lecb_dec_ret
2356.align 4
2357.Lecb_dec_loop1:
2358	movups (INP), STATE1
2359	call _aesni_dec1
2360	movups STATE1, (OUTP)
2361	sub $16, LEN
2362	add $16, INP
2363	add $16, OUTP
2364	cmp $16, LEN
2365	jge .Lecb_dec_loop1
2366.Lecb_dec_ret:
2367#ifndef __x86_64__
2368	popl KLEN
2369	popl KEYP
2370	popl LEN
2371#endif
2372	ret
2373
2374/*
2375 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2376 *		      size_t len, u8 *iv)
2377 */
2378ENTRY(aesni_cbc_enc)
2379#ifndef __x86_64__
2380	pushl IVP
2381	pushl LEN
2382	pushl KEYP
2383	pushl KLEN
2384	movl 20(%esp), KEYP
2385	movl 24(%esp), OUTP
2386	movl 28(%esp), INP
2387	movl 32(%esp), LEN
2388	movl 36(%esp), IVP
2389#endif
2390	cmp $16, LEN
2391	jb .Lcbc_enc_ret
2392	mov 480(KEYP), KLEN
2393	movups (IVP), STATE	# load iv as initial state
2394.align 4
2395.Lcbc_enc_loop:
2396	movups (INP), IN	# load input
2397	pxor IN, STATE
2398	call _aesni_enc1
2399	movups STATE, (OUTP)	# store output
2400	sub $16, LEN
2401	add $16, INP
2402	add $16, OUTP
2403	cmp $16, LEN
2404	jge .Lcbc_enc_loop
2405	movups STATE, (IVP)
2406.Lcbc_enc_ret:
2407#ifndef __x86_64__
2408	popl KLEN
2409	popl KEYP
2410	popl LEN
2411	popl IVP
2412#endif
2413	ret
2414
2415/*
2416 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2417 *		      size_t len, u8 *iv)
2418 */
2419ENTRY(aesni_cbc_dec)
2420#ifndef __x86_64__
2421	pushl IVP
2422	pushl LEN
2423	pushl KEYP
2424	pushl KLEN
2425	movl 20(%esp), KEYP
2426	movl 24(%esp), OUTP
2427	movl 28(%esp), INP
2428	movl 32(%esp), LEN
2429	movl 36(%esp), IVP
2430#endif
2431	cmp $16, LEN
2432	jb .Lcbc_dec_just_ret
2433	mov 480(KEYP), KLEN
2434	add $240, KEYP
2435	movups (IVP), IV
2436	cmp $64, LEN
2437	jb .Lcbc_dec_loop1
2438.align 4
2439.Lcbc_dec_loop4:
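	# CBC decryption: each decrypted block is XORed with the previous
	# ciphertext block (or the IV).  On x86 only eight %xmm registers
	# exist, so two of the saved ciphertext blocks are re-read from (INP)
	# for the XOR.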
2440	movups (INP), IN1
2441	movaps IN1, STATE1
2442	movups 0x10(INP), IN2
2443	movaps IN2, STATE2
2444#ifdef __x86_64__
2445	movups 0x20(INP), IN3
2446	movaps IN3, STATE3
2447	movups 0x30(INP), IN4
2448	movaps IN4, STATE4
2449#else
2450	movups 0x20(INP), IN1
2451	movaps IN1, STATE3
2452	movups 0x30(INP), IN2
2453	movaps IN2, STATE4
2454#endif
2455	call _aesni_dec4
2456	pxor IV, STATE1
2457#ifdef __x86_64__
2458	pxor IN1, STATE2
2459	pxor IN2, STATE3
2460	pxor IN3, STATE4
2461	movaps IN4, IV
2462#else
2463	pxor (INP), STATE2
2464	pxor 0x10(INP), STATE3
2465	pxor IN1, STATE4
2466	movaps IN2, IV
2467#endif
2468	movups STATE1, (OUTP)
2469	movups STATE2, 0x10(OUTP)
2470	movups STATE3, 0x20(OUTP)
2471	movups STATE4, 0x30(OUTP)
2472	sub $64, LEN
2473	add $64, INP
2474	add $64, OUTP
2475	cmp $64, LEN
2476	jge .Lcbc_dec_loop4
2477	cmp $16, LEN
2478	jb .Lcbc_dec_ret
2479.align 4
2480.Lcbc_dec_loop1:
2481	movups (INP), IN
2482	movaps IN, STATE
2483	call _aesni_dec1
2484	pxor IV, STATE
2485	movups STATE, (OUTP)
2486	movaps IN, IV
2487	sub $16, LEN
2488	add $16, INP
2489	add $16, OUTP
2490	cmp $16, LEN
2491	jge .Lcbc_dec_loop1
2492.Lcbc_dec_ret:
2493	movups IV, (IVP)
2494.Lcbc_dec_just_ret:
2495#ifndef __x86_64__
2496	popl KLEN
2497	popl KEYP
2498	popl LEN
2499	popl IVP
2500#endif
2501	ret
2502
2503#ifdef __x86_64__
2504.align 16
2505.Lbswap_mask:
2506	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2507
2508/*
2509 * _aesni_inc_init:	internal ABI
2510 *	setup registers used by _aesni_inc
2511 * input:
2512 *	IV
2513 * output:
2514 *	CTR:	== IV, in little endian
2515 *	TCTR_LOW: == lower qword of CTR
2516 *	INC:	== 1, in little endian
2517 *	BSWAP_MASK == endian swapping mask
2518 */
2519.align 4
2520_aesni_inc_init:
2521	movaps .Lbswap_mask, BSWAP_MASK
2522	movaps IV, CTR
2523	PSHUFB_XMM BSWAP_MASK CTR
2524	mov $1, TCTR_LOW
2525	MOVQ_R64_XMM TCTR_LOW INC
2526	MOVQ_R64_XMM CTR TCTR_LOW
2527	ret
2528
2529/*
2530 * _aesni_inc:		internal ABI
2531 *	Increment IV by 1; IV is in big endian
2532 * input:
2533 *	IV
2534 *	CTR:	== IV, in little endian
2535 *	TCTR_LOW: == lower qword of CTR
2536 *	INC:	== 1, in little endian
2537 *	BSWAP_MASK == endian swapping mask
2538 * output:
2539 *	IV:	incremented by 1
2540 * changed:
2541 *	CTR:	== output IV, in little endian
2542 *	TCTR_LOW: == lower qword of CTR
2543 */
2544.align 4
2545_aesni_inc:
2546	paddq INC, CTR
2547	add $1, TCTR_LOW
2548	jnc .Linc_low
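	# the low qword wrapped: shift INC into the high qword position and
	# add it again to propagate the carry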
2549	pslldq $8, INC
2550	paddq INC, CTR
2551	psrldq $8, INC
2552.Linc_low:
2553	movaps CTR, IV
2554	PSHUFB_XMM BSWAP_MASK IV
2555	ret
2556
2557/*
2558 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2559 *		      size_t len, u8 *iv)
2560 */
2561ENTRY(aesni_ctr_enc)
2562	cmp $16, LEN
2563	jb .Lctr_enc_just_ret
2564	mov 480(KEYP), KLEN
2565	movups (IVP), IV
2566	call _aesni_inc_init
2567	cmp $64, LEN
2568	jb .Lctr_enc_loop1
2569.align 4
2570.Lctr_enc_loop4:
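	# generate four counter blocks (IV, IV+1, IV+2, IV+3), encrypt them
	# with _aesni_enc4 and XOR with the input to produce the output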
2571	movaps IV, STATE1
2572	call _aesni_inc
2573	movups (INP), IN1
2574	movaps IV, STATE2
2575	call _aesni_inc
2576	movups 0x10(INP), IN2
2577	movaps IV, STATE3
2578	call _aesni_inc
2579	movups 0x20(INP), IN3
2580	movaps IV, STATE4
2581	call _aesni_inc
2582	movups 0x30(INP), IN4
2583	call _aesni_enc4
2584	pxor IN1, STATE1
2585	movups STATE1, (OUTP)
2586	pxor IN2, STATE2
2587	movups STATE2, 0x10(OUTP)
2588	pxor IN3, STATE3
2589	movups STATE3, 0x20(OUTP)
2590	pxor IN4, STATE4
2591	movups STATE4, 0x30(OUTP)
2592	sub $64, LEN
2593	add $64, INP
2594	add $64, OUTP
2595	cmp $64, LEN
2596	jge .Lctr_enc_loop4
2597	cmp $16, LEN
2598	jb .Lctr_enc_ret
2599.align 4
2600.Lctr_enc_loop1:
2601	movaps IV, STATE
2602	call _aesni_inc
2603	movups (INP), IN
2604	call _aesni_enc1
2605	pxor IN, STATE
2606	movups STATE, (OUTP)
2607	sub $16, LEN
2608	add $16, INP
2609	add $16, OUTP
2610	cmp $16, LEN
2611	jge .Lctr_enc_loop1
2612.Lctr_enc_ret:
2613	movups IV, (IVP)
2614.Lctr_enc_just_ret:
2615	ret
2616#endif
2617