/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34
35#ifdef __x86_64__
36.data
37.align 16
38.Lgf128mul_x_ble_mask:
39	.octa 0x00000000000000010000000000000087
40
41POLY:   .octa 0xC2000000000000000000000000000001
42TWOONE: .octa 0x00000001000000000000000000000001
43
44# order of these constants should not change.
45# more specifically, ALL_F should follow SHIFT_MASK,
46# and ZERO should follow ALL_F
47
48SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
49MASK1:      .octa 0x0000000000000000ffffffffffffffff
50MASK2:      .octa 0xffffffffffffffff0000000000000000
51SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
52ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
53ZERO:       .octa 0x00000000000000000000000000000000
54ONE:        .octa 0x00000000000000000000000000000001
55F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
56dec:        .octa 0x1
57enc:        .octa 0x2
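
# Notes on the constants above (interpretation, not used by the assembler):
# POLY encodes the GHASH reduction polynomial x^128 + x^127 + x^126 + x^121 + 1
# (the x^128 term is implicit) and, together with TWOONE, is used in
# aesni_gcm_enc/dec below to reduce HashKey<<1 modulo that polynomial.
# SHUF_MASK is the PSHUFB pattern that byte-reverses a 16-byte block before
# and after the GHASH operations.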
58
59
60.text
61
62
63#define	STACK_OFFSET    8*3
64#define	HashKey		16*0	// store HashKey <<1 mod poly here
65#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
66#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
67#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
68#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
69				// bits of  HashKey <<1 mod poly here
70				//(for Karatsuba purposes)
71#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
72				// bits of  HashKey^2 <<1 mod poly here
73				// (for Karatsuba purposes)
74#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
75				// bits of  HashKey^3 <<1 mod poly here
76				// (for Karatsuba purposes)
77#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
78				// bits of  HashKey^4 <<1 mod poly here
79				// (for Karatsuba purposes)
80#define	VARIABLE_OFFSET	16*8
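
# Layout of the scratch area reserved below (a summary of the defines above):
#   16*0 .. 16*3 (%rsp)   HashKey^1..HashKey^4, each <<1 mod poly
#   16*4 .. 16*7 (%rsp)   the XORed high/low halves of those keys (Karatsuba)
#   VARIABLE_OFFSET       total of 16*8 bytes carved out of the stack
# arg7..arg10 are read from the caller's stack through %r14, which holds the
# value of %rsp saved after the three GPR pushes; STACK_OFFSET (8*3) skips
# those saved registers on the way to the stack-passed arguments.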
81
82#define arg1 rdi
83#define arg2 rsi
84#define arg3 rdx
85#define arg4 rcx
86#define arg5 r8
87#define arg6 r9
88#define arg7 STACK_OFFSET+8(%r14)
89#define arg8 STACK_OFFSET+16(%r14)
90#define arg9 STACK_OFFSET+24(%r14)
91#define arg10 STACK_OFFSET+32(%r14)
92#endif
93
94
95#define STATE1	%xmm0
96#define STATE2	%xmm4
97#define STATE3	%xmm5
98#define STATE4	%xmm6
99#define STATE	STATE1
100#define IN1	%xmm1
101#define IN2	%xmm7
102#define IN3	%xmm8
103#define IN4	%xmm9
104#define IN	IN1
105#define KEY	%xmm2
106#define IV	%xmm3
107
108#define BSWAP_MASK %xmm10
109#define CTR	%xmm11
110#define INC	%xmm12
111
112#define GF128MUL_MASK %xmm10
113
114#ifdef __x86_64__
115#define AREG	%rax
116#define KEYP	%rdi
117#define OUTP	%rsi
118#define UKEYP	OUTP
119#define INP	%rdx
120#define LEN	%rcx
121#define IVP	%r8
122#define KLEN	%r9d
123#define T1	%r10
124#define TKEYP	T1
125#define T2	%r11
126#define TCTR_LOW T2
127#else
128#define AREG	%eax
129#define KEYP	%edi
130#define OUTP	AREG
131#define UKEYP	OUTP
132#define INP	%edx
133#define LEN	%esi
134#define IVP	%ebp
135#define KLEN	%ebx
136#define T1	%ecx
137#define TKEYP	T1
138#endif
139
140
141#ifdef __x86_64__
142/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
143*
144*
145* Input: A and B (128-bits each, bit-reflected)
146* Output: C = A*B*x mod poly, (i.e. >>1 )
147* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
148* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
149*
150*/
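#
# For reference, the Karatsuba split used below: with A = a1:a0 and
# B = b1:b0 (64-bit halves), only three carry-less multiplies are needed,
#     A*B = a1*b1*x^128 + a0*b0 + (a1*b0 + a0*b1)*x^64
# where the middle term is obtained as (a1+a0)*(b1+b0) + a1*b1 + a0*b0
# (all additions are XORs). The 256-bit product is then folded back to
# 128 bits by the two-phase shift/XOR reduction implemented below.
#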
151.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
152	movdqa	  \GH, \TMP1
153	pshufd	  $78, \GH, \TMP2
154	pshufd	  $78, \HK, \TMP3
155	pxor	  \GH, \TMP2            # TMP2 = a1+a0
156	pxor	  \HK, \TMP3            # TMP3 = b1+b0
157	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
158	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
159	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
160	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = a1*b0 + a0*b1 (middle term)
162	movdqa	  \TMP2, \TMP3
163	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
164	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
165	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
167
168        # first phase of the reduction
169
170	movdqa    \GH, \TMP2
171	movdqa    \GH, \TMP3
172	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
					# in order to perform
174					# independent shifts
175	pslld     $31, \TMP2            # packed right shift <<31
176	pslld     $30, \TMP3            # packed right shift <<30
177	pslld     $25, \TMP4            # packed right shift <<25
178	pxor      \TMP3, \TMP2          # xor the shifted versions
179	pxor      \TMP4, \TMP2
180	movdqa    \TMP2, \TMP5
181	psrldq    $4, \TMP5             # right shift TMP5 1 DW
182	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
183	pxor      \TMP2, \GH
184
185        # second phase of the reduction
186
187	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
					# in order to perform
189					# independent shifts
190	movdqa    \GH,\TMP3
191	movdqa    \GH,\TMP4
192	psrld     $1,\TMP2              # packed left shift >>1
193	psrld     $2,\TMP3              # packed left shift >>2
194	psrld     $7,\TMP4              # packed left shift >>7
195	pxor      \TMP3,\TMP2		# xor the shifted versions
196	pxor      \TMP4,\TMP2
197	pxor      \TMP5, \TMP2
198	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in GH
200.endm
201
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
*/
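#
# Example: with a = 100 bytes of plaintext, b = floor(100/16) = 6 complete
# blocks, so num_initial_blocks = 6 mod 4 = 2; those two blocks are handled
# here and the remaining multiple of four blocks by the parallel loop below.
#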
212
213
214.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
215XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
216	mov	   arg7, %r10           # %r10 = AAD
217	mov	   arg8, %r12           # %r12 = aadLen
218	mov	   %r12, %r11
219	pxor	   %xmm\i, %xmm\i
220_get_AAD_loop\num_initial_blocks\operation:
221	movd	   (%r10), \TMP1
222	pslldq	   $12, \TMP1
223	psrldq	   $4, %xmm\i
224	pxor	   \TMP1, %xmm\i
225	add	   $4, %r10
226	sub	   $4, %r12
227	jne	   _get_AAD_loop\num_initial_blocks\operation
228	cmp	   $16, %r11
229	je	   _get_AAD_loop2_done\num_initial_blocks\operation
230	mov	   $16, %r12
231_get_AAD_loop2\num_initial_blocks\operation:
232	psrldq	   $4, %xmm\i
233	sub	   $4, %r12
234	cmp	   %r11, %r12
235	jne	   _get_AAD_loop2\num_initial_blocks\operation
236_get_AAD_loop2_done\num_initial_blocks\operation:
237        movdqa     SHUF_MASK(%rip), %xmm14
238	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
239
240	xor	   %r11, %r11 # initialise the data pointer offset as zero
241
242        # start AES for num_initial_blocks blocks
243
244	mov	   %arg5, %rax                      # %rax = *Y0
245	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
246        movdqa     SHUF_MASK(%rip), %xmm14
247	PSHUFB_XMM   %xmm14, \XMM0
248
249.if (\i == 5) || (\i == 6) || (\i == 7)
250.irpc index, \i_seq
251	paddd	   ONE(%rip), \XMM0                 # INCR Y0
252	movdqa	   \XMM0, %xmm\index
253        movdqa     SHUF_MASK(%rip), %xmm14
254	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
255
256.endr
257.irpc index, \i_seq
258	pxor	   16*0(%arg1), %xmm\index
259.endr
260.irpc index, \i_seq
261	movaps 0x10(%rdi), \TMP1
262	AESENC     \TMP1, %xmm\index          # Round 1
263.endr
264.irpc index, \i_seq
265	movaps 0x20(%arg1), \TMP1
266	AESENC     \TMP1, %xmm\index          # Round 2
267.endr
.irpc index, \i_seq
	movaps 0x30(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 3
.endr
.irpc index, \i_seq
	movaps 0x40(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 4
.endr
.irpc index, \i_seq
	movaps 0x50(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 5
.endr
.irpc index, \i_seq
	movaps 0x60(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 6
.endr
.irpc index, \i_seq
	movaps 0x70(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 7
.endr
.irpc index, \i_seq
	movaps 0x80(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 8
.endr
.irpc index, \i_seq
	movaps 0x90(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 9
.endr
296.irpc index, \i_seq
297	movaps 0xa0(%arg1), \TMP1
298	AESENCLAST \TMP1, %xmm\index         # Round 10
299.endr
300.irpc index, \i_seq
301	movdqu	   (%arg3 , %r11, 1), \TMP1
302	pxor	   \TMP1, %xmm\index
303	movdqu	   %xmm\index, (%arg2 , %r11, 1)
304	# write back plaintext/ciphertext for num_initial_blocks
305	add	   $16, %r11
306
307	movdqa     \TMP1, %xmm\index
308        movdqa     SHUF_MASK(%rip), %xmm14
309	PSHUFB_XMM	   %xmm14, %xmm\index
310
311		# prepare plaintext/ciphertext for GHASH computation
312.endr
313.endif
314	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
315        # apply GHASH on num_initial_blocks blocks
316
317.if \i == 5
318        pxor       %xmm5, %xmm6
319	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
320        pxor       %xmm6, %xmm7
321	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
322        pxor       %xmm7, %xmm8
323	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
324.elseif \i == 6
325        pxor       %xmm6, %xmm7
326	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
327        pxor       %xmm7, %xmm8
328	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
329.elseif \i == 7
330        pxor       %xmm7, %xmm8
331	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
332.endif
333	cmp	   $64, %r13
334	jl	_initial_blocks_done\num_initial_blocks\operation
335	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
*/
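# HashKey^2..HashKey^4 (each <<1 mod poly) are produced by repeated
# GHASH_MUL calls and stored at HashKey_2..HashKey_4 on the stack, together
# with their XORed halves, interleaved with AES rounds 1-9 of the next four
# counter blocks.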
341	paddd	   ONE(%rip), \XMM0              # INCR Y0
342	movdqa	   \XMM0, \XMM1
343        movdqa     SHUF_MASK(%rip), %xmm14
344	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
345
346	paddd	   ONE(%rip), \XMM0              # INCR Y0
347	movdqa	   \XMM0, \XMM2
348        movdqa     SHUF_MASK(%rip), %xmm14
349	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
350
351	paddd	   ONE(%rip), \XMM0              # INCR Y0
352	movdqa	   \XMM0, \XMM3
353        movdqa     SHUF_MASK(%rip), %xmm14
354	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
355
356	paddd	   ONE(%rip), \XMM0              # INCR Y0
357	movdqa	   \XMM0, \XMM4
358        movdqa     SHUF_MASK(%rip), %xmm14
359	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
360
361	pxor	   16*0(%arg1), \XMM1
362	pxor	   16*0(%arg1), \XMM2
363	pxor	   16*0(%arg1), \XMM3
364	pxor	   16*0(%arg1), \XMM4
365	movdqa	   \TMP3, \TMP5
366	pshufd	   $78, \TMP3, \TMP1
367	pxor	   \TMP3, \TMP1
368	movdqa	   \TMP1, HashKey_k(%rsp)
369	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
370# TMP5 = HashKey^2<<1 (mod poly)
371	movdqa	   \TMP5, HashKey_2(%rsp)
372# HashKey_2 = HashKey^2<<1 (mod poly)
373	pshufd	   $78, \TMP5, \TMP1
374	pxor	   \TMP5, \TMP1
375	movdqa	   \TMP1, HashKey_2_k(%rsp)
376.irpc index, 1234 # do 4 rounds
377	movaps 0x10*\index(%arg1), \TMP1
378	AESENC	   \TMP1, \XMM1
379	AESENC	   \TMP1, \XMM2
380	AESENC	   \TMP1, \XMM3
381	AESENC	   \TMP1, \XMM4
382.endr
383	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
384# TMP5 = HashKey^3<<1 (mod poly)
385	movdqa	   \TMP5, HashKey_3(%rsp)
386	pshufd	   $78, \TMP5, \TMP1
387	pxor	   \TMP5, \TMP1
388	movdqa	   \TMP1, HashKey_3_k(%rsp)
389.irpc index, 56789 # do next 5 rounds
390	movaps 0x10*\index(%arg1), \TMP1
391	AESENC	   \TMP1, \XMM1
392	AESENC	   \TMP1, \XMM2
393	AESENC	   \TMP1, \XMM3
394	AESENC	   \TMP1, \XMM4
395.endr
396	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
398	movdqa	   \TMP5, HashKey_4(%rsp)
399	pshufd	   $78, \TMP5, \TMP1
400	pxor	   \TMP5, \TMP1
401	movdqa	   \TMP1, HashKey_4_k(%rsp)
402	movaps 0xa0(%arg1), \TMP2
403	AESENCLAST \TMP2, \XMM1
404	AESENCLAST \TMP2, \XMM2
405	AESENCLAST \TMP2, \XMM3
406	AESENCLAST \TMP2, \XMM4
407	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
408	pxor	   \TMP1, \XMM1
409	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
410	movdqa     \TMP1, \XMM1
411	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
412	pxor	   \TMP1, \XMM2
413	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
414	movdqa     \TMP1, \XMM2
415	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
416	pxor	   \TMP1, \XMM3
417	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
418	movdqa     \TMP1, \XMM3
419	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
420	pxor	   \TMP1, \XMM4
421	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
422	movdqa     \TMP1, \XMM4
423	add	   $64, %r11
424        movdqa     SHUF_MASK(%rip), %xmm14
425	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
426	pxor	   \XMMDst, \XMM1
427# combine GHASHed value with the corresponding ciphertext
428        movdqa     SHUF_MASK(%rip), %xmm14
429	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
430        movdqa     SHUF_MASK(%rip), %xmm14
431	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
432        movdqa     SHUF_MASK(%rip), %xmm14
433	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
434
435_initial_blocks_done\num_initial_blocks\operation:
436
437.endm
438
439
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
*/
450
451
452.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
453XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
454	mov	   arg7, %r10           # %r10 = AAD
455	mov	   arg8, %r12           # %r12 = aadLen
456	mov	   %r12, %r11
457	pxor	   %xmm\i, %xmm\i
458_get_AAD_loop\num_initial_blocks\operation:
459	movd	   (%r10), \TMP1
460	pslldq	   $12, \TMP1
461	psrldq	   $4, %xmm\i
462	pxor	   \TMP1, %xmm\i
463	add	   $4, %r10
464	sub	   $4, %r12
465	jne	   _get_AAD_loop\num_initial_blocks\operation
466	cmp	   $16, %r11
467	je	   _get_AAD_loop2_done\num_initial_blocks\operation
468	mov	   $16, %r12
469_get_AAD_loop2\num_initial_blocks\operation:
470	psrldq	   $4, %xmm\i
471	sub	   $4, %r12
472	cmp	   %r11, %r12
473	jne	   _get_AAD_loop2\num_initial_blocks\operation
474_get_AAD_loop2_done\num_initial_blocks\operation:
475        movdqa     SHUF_MASK(%rip), %xmm14
476	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
477
478	xor	   %r11, %r11 # initialise the data pointer offset as zero
479
480        # start AES for num_initial_blocks blocks
481
482	mov	   %arg5, %rax                      # %rax = *Y0
483	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
484        movdqa     SHUF_MASK(%rip), %xmm14
485	PSHUFB_XMM   %xmm14, \XMM0
486
487.if (\i == 5) || (\i == 6) || (\i == 7)
488.irpc index, \i_seq
489	paddd	   ONE(%rip), \XMM0                 # INCR Y0
490	movdqa	   \XMM0, %xmm\index
491        movdqa     SHUF_MASK(%rip), %xmm14
492	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
493
494.endr
495.irpc index, \i_seq
496	pxor	   16*0(%arg1), %xmm\index
497.endr
498.irpc index, \i_seq
499	movaps 0x10(%rdi), \TMP1
500	AESENC     \TMP1, %xmm\index          # Round 1
501.endr
502.irpc index, \i_seq
503	movaps 0x20(%arg1), \TMP1
504	AESENC     \TMP1, %xmm\index          # Round 2
505.endr
.irpc index, \i_seq
	movaps 0x30(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 3
.endr
.irpc index, \i_seq
	movaps 0x40(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 4
.endr
.irpc index, \i_seq
	movaps 0x50(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 5
.endr
.irpc index, \i_seq
	movaps 0x60(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 6
.endr
.irpc index, \i_seq
	movaps 0x70(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 7
.endr
.irpc index, \i_seq
	movaps 0x80(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 8
.endr
.irpc index, \i_seq
	movaps 0x90(%arg1), \TMP1
	AESENC     \TMP1, %xmm\index          # Round 9
.endr
534.irpc index, \i_seq
535	movaps 0xa0(%arg1), \TMP1
536	AESENCLAST \TMP1, %xmm\index         # Round 10
537.endr
538.irpc index, \i_seq
539	movdqu	   (%arg3 , %r11, 1), \TMP1
540	pxor	   \TMP1, %xmm\index
541	movdqu	   %xmm\index, (%arg2 , %r11, 1)
542	# write back plaintext/ciphertext for num_initial_blocks
543	add	   $16, %r11
544
545        movdqa     SHUF_MASK(%rip), %xmm14
546	PSHUFB_XMM	   %xmm14, %xmm\index
547
548		# prepare plaintext/ciphertext for GHASH computation
549.endr
550.endif
551	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
552        # apply GHASH on num_initial_blocks blocks
553
554.if \i == 5
555        pxor       %xmm5, %xmm6
556	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
557        pxor       %xmm6, %xmm7
558	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
559        pxor       %xmm7, %xmm8
560	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
561.elseif \i == 6
562        pxor       %xmm6, %xmm7
563	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
564        pxor       %xmm7, %xmm8
565	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
566.elseif \i == 7
567        pxor       %xmm7, %xmm8
568	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
569.endif
570	cmp	   $64, %r13
571	jl	_initial_blocks_done\num_initial_blocks\operation
572	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
*/
578	paddd	   ONE(%rip), \XMM0              # INCR Y0
579	movdqa	   \XMM0, \XMM1
580        movdqa     SHUF_MASK(%rip), %xmm14
581	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
582
583	paddd	   ONE(%rip), \XMM0              # INCR Y0
584	movdqa	   \XMM0, \XMM2
585        movdqa     SHUF_MASK(%rip), %xmm14
586	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
587
588	paddd	   ONE(%rip), \XMM0              # INCR Y0
589	movdqa	   \XMM0, \XMM3
590        movdqa     SHUF_MASK(%rip), %xmm14
591	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
592
593	paddd	   ONE(%rip), \XMM0              # INCR Y0
594	movdqa	   \XMM0, \XMM4
595        movdqa     SHUF_MASK(%rip), %xmm14
596	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
597
598	pxor	   16*0(%arg1), \XMM1
599	pxor	   16*0(%arg1), \XMM2
600	pxor	   16*0(%arg1), \XMM3
601	pxor	   16*0(%arg1), \XMM4
602	movdqa	   \TMP3, \TMP5
603	pshufd	   $78, \TMP3, \TMP1
604	pxor	   \TMP3, \TMP1
605	movdqa	   \TMP1, HashKey_k(%rsp)
606	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
607# TMP5 = HashKey^2<<1 (mod poly)
608	movdqa	   \TMP5, HashKey_2(%rsp)
609# HashKey_2 = HashKey^2<<1 (mod poly)
610	pshufd	   $78, \TMP5, \TMP1
611	pxor	   \TMP5, \TMP1
612	movdqa	   \TMP1, HashKey_2_k(%rsp)
613.irpc index, 1234 # do 4 rounds
614	movaps 0x10*\index(%arg1), \TMP1
615	AESENC	   \TMP1, \XMM1
616	AESENC	   \TMP1, \XMM2
617	AESENC	   \TMP1, \XMM3
618	AESENC	   \TMP1, \XMM4
619.endr
620	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
621# TMP5 = HashKey^3<<1 (mod poly)
622	movdqa	   \TMP5, HashKey_3(%rsp)
623	pshufd	   $78, \TMP5, \TMP1
624	pxor	   \TMP5, \TMP1
625	movdqa	   \TMP1, HashKey_3_k(%rsp)
626.irpc index, 56789 # do next 5 rounds
627	movaps 0x10*\index(%arg1), \TMP1
628	AESENC	   \TMP1, \XMM1
629	AESENC	   \TMP1, \XMM2
630	AESENC	   \TMP1, \XMM3
631	AESENC	   \TMP1, \XMM4
632.endr
633	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
635	movdqa	   \TMP5, HashKey_4(%rsp)
636	pshufd	   $78, \TMP5, \TMP1
637	pxor	   \TMP5, \TMP1
638	movdqa	   \TMP1, HashKey_4_k(%rsp)
639	movaps 0xa0(%arg1), \TMP2
640	AESENCLAST \TMP2, \XMM1
641	AESENCLAST \TMP2, \XMM2
642	AESENCLAST \TMP2, \XMM3
643	AESENCLAST \TMP2, \XMM4
644	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
645	pxor	   \TMP1, \XMM1
646	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
647	pxor	   \TMP1, \XMM2
648	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
649	pxor	   \TMP1, \XMM3
650	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
651	pxor	   \TMP1, \XMM4
652	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
653	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
654	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
655	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
656
657	add	   $64, %r11
658        movdqa     SHUF_MASK(%rip), %xmm14
659	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
660	pxor	   \XMMDst, \XMM1
661# combine GHASHed value with the corresponding ciphertext
662        movdqa     SHUF_MASK(%rip), %xmm14
663	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
664        movdqa     SHUF_MASK(%rip), %xmm14
665	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
666        movdqa     SHUF_MASK(%rip), %xmm14
667	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
668
669_initial_blocks_done\num_initial_blocks\operation:
670
671.endm
672
673/*
674* encrypt 4 blocks at a time
675* ghash the 4 previously encrypted ciphertext blocks
676* arg1, %arg2, %arg3 are used as pointers only, not modified
677* %r11 is the data offset value
678*/
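#
# The AES rounds for the four new counter blocks are interleaved with the
# PCLMULQDQ-based GHASH of the four previous blocks (multiplied by
# HashKey^4..HashKey^1 respectively), so the two dependency chains can
# overlap instead of serialising on each other's latency.
#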
679.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
680TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
681
682	movdqa	  \XMM1, \XMM5
683	movdqa	  \XMM2, \XMM6
684	movdqa	  \XMM3, \XMM7
685	movdqa	  \XMM4, \XMM8
686
687        movdqa    SHUF_MASK(%rip), %xmm15
688        # multiply TMP5 * HashKey using karatsuba
689
690	movdqa	  \XMM5, \TMP4
691	pshufd	  $78, \XMM5, \TMP6
692	pxor	  \XMM5, \TMP6
693	paddd     ONE(%rip), \XMM0		# INCR CNT
694	movdqa	  HashKey_4(%rsp), \TMP5
695	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
696	movdqa    \XMM0, \XMM1
697	paddd     ONE(%rip), \XMM0		# INCR CNT
698	movdqa    \XMM0, \XMM2
699	paddd     ONE(%rip), \XMM0		# INCR CNT
700	movdqa    \XMM0, \XMM3
701	paddd     ONE(%rip), \XMM0		# INCR CNT
702	movdqa    \XMM0, \XMM4
703	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
704	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
705	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
706	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
707	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
708
709	pxor	  (%arg1), \XMM1
710	pxor	  (%arg1), \XMM2
711	pxor	  (%arg1), \XMM3
712	pxor	  (%arg1), \XMM4
713	movdqa	  HashKey_4_k(%rsp), \TMP5
714	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
715	movaps 0x10(%arg1), \TMP1
716	AESENC	  \TMP1, \XMM1              # Round 1
717	AESENC	  \TMP1, \XMM2
718	AESENC	  \TMP1, \XMM3
719	AESENC	  \TMP1, \XMM4
720	movaps 0x20(%arg1), \TMP1
721	AESENC	  \TMP1, \XMM1              # Round 2
722	AESENC	  \TMP1, \XMM2
723	AESENC	  \TMP1, \XMM3
724	AESENC	  \TMP1, \XMM4
725	movdqa	  \XMM6, \TMP1
726	pshufd	  $78, \XMM6, \TMP2
727	pxor	  \XMM6, \TMP2
728	movdqa	  HashKey_3(%rsp), \TMP5
729	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
730	movaps 0x30(%arg1), \TMP3
731	AESENC    \TMP3, \XMM1              # Round 3
732	AESENC    \TMP3, \XMM2
733	AESENC    \TMP3, \XMM3
734	AESENC    \TMP3, \XMM4
735	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
736	movaps 0x40(%arg1), \TMP3
737	AESENC	  \TMP3, \XMM1              # Round 4
738	AESENC	  \TMP3, \XMM2
739	AESENC	  \TMP3, \XMM3
740	AESENC	  \TMP3, \XMM4
741	movdqa	  HashKey_3_k(%rsp), \TMP5
742	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
743	movaps 0x50(%arg1), \TMP3
744	AESENC	  \TMP3, \XMM1              # Round 5
745	AESENC	  \TMP3, \XMM2
746	AESENC	  \TMP3, \XMM3
747	AESENC	  \TMP3, \XMM4
748	pxor	  \TMP1, \TMP4
749# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
750	pxor	  \XMM6, \XMM5
751	pxor	  \TMP2, \TMP6
752	movdqa	  \XMM7, \TMP1
753	pshufd	  $78, \XMM7, \TMP2
754	pxor	  \XMM7, \TMP2
755	movdqa	  HashKey_2(%rsp ), \TMP5
756
757        # Multiply TMP5 * HashKey using karatsuba
758
759	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
760	movaps 0x60(%arg1), \TMP3
761	AESENC	  \TMP3, \XMM1              # Round 6
762	AESENC	  \TMP3, \XMM2
763	AESENC	  \TMP3, \XMM3
764	AESENC	  \TMP3, \XMM4
765	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
766	movaps 0x70(%arg1), \TMP3
767	AESENC	  \TMP3, \XMM1             # Round 7
768	AESENC	  \TMP3, \XMM2
769	AESENC	  \TMP3, \XMM3
770	AESENC	  \TMP3, \XMM4
771	movdqa	  HashKey_2_k(%rsp), \TMP5
772	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
773	movaps 0x80(%arg1), \TMP3
774	AESENC	  \TMP3, \XMM1             # Round 8
775	AESENC	  \TMP3, \XMM2
776	AESENC	  \TMP3, \XMM3
777	AESENC	  \TMP3, \XMM4
778	pxor	  \TMP1, \TMP4
779# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
780	pxor	  \XMM7, \XMM5
781	pxor	  \TMP2, \TMP6
782
783        # Multiply XMM8 * HashKey
784        # XMM8 and TMP5 hold the values for the two operands
785
786	movdqa	  \XMM8, \TMP1
787	pshufd	  $78, \XMM8, \TMP2
788	pxor	  \XMM8, \TMP2
789	movdqa	  HashKey(%rsp), \TMP5
790	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
791	movaps 0x90(%arg1), \TMP3
792	AESENC	  \TMP3, \XMM1            # Round 9
793	AESENC	  \TMP3, \XMM2
794	AESENC	  \TMP3, \XMM3
795	AESENC	  \TMP3, \XMM4
796	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
797	movaps 0xa0(%arg1), \TMP3
798	AESENCLAST \TMP3, \XMM1           # Round 10
799	AESENCLAST \TMP3, \XMM2
800	AESENCLAST \TMP3, \XMM3
801	AESENCLAST \TMP3, \XMM4
802	movdqa    HashKey_k(%rsp), \TMP5
803	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
804	movdqu	  (%arg3,%r11,1), \TMP3
805	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
806	movdqu	  16(%arg3,%r11,1), \TMP3
807	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
808	movdqu	  32(%arg3,%r11,1), \TMP3
809	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
810	movdqu	  48(%arg3,%r11,1), \TMP3
811	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
812        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
813        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
814        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
815        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
816	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
817	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
818	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
819	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
820
821	pxor	  \TMP4, \TMP1
822	pxor	  \XMM8, \XMM5
823	pxor	  \TMP6, \TMP2
824	pxor	  \TMP1, \TMP2
825	pxor	  \XMM5, \TMP2
826	movdqa	  \TMP2, \TMP3
827	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
828	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
829	pxor	  \TMP3, \XMM5
830	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
831
832        # first phase of reduction
833
834	movdqa    \XMM5, \TMP2
835	movdqa    \XMM5, \TMP3
836	movdqa    \XMM5, \TMP4
837# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
838	pslld     $31, \TMP2                   # packed right shift << 31
839	pslld     $30, \TMP3                   # packed right shift << 30
840	pslld     $25, \TMP4                   # packed right shift << 25
841	pxor      \TMP3, \TMP2	               # xor the shifted versions
842	pxor      \TMP4, \TMP2
843	movdqa    \TMP2, \TMP5
844	psrldq    $4, \TMP5                    # right shift T5 1 DW
845	pslldq    $12, \TMP2                   # left shift T2 3 DWs
846	pxor      \TMP2, \XMM5
847
848        # second phase of reduction
849
850	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
851	movdqa    \XMM5,\TMP3
852	movdqa    \XMM5,\TMP4
853	psrld     $1, \TMP2                    # packed left shift >>1
854	psrld     $2, \TMP3                    # packed left shift >>2
855	psrld     $7, \TMP4                    # packed left shift >>7
856	pxor      \TMP3,\TMP2		       # xor the shifted versions
857	pxor      \TMP4,\TMP2
858	pxor      \TMP5, \TMP2
859	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in XMM5
861
862	pxor	  \XMM5, \XMM1
863.endm
864
865/*
866* decrypt 4 blocks at a time
867* ghash the 4 previously decrypted ciphertext blocks
868* arg1, %arg2, %arg3 are used as pointers only, not modified
869* %r11 is the data offset value
870*/
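#
# Same structure as the _ENC variant above; the difference is in the tail,
# where XMM1..XMM4 are reloaded with the ciphertext that was read in (TMP3)
# rather than with the freshly produced output, since for decryption the
# next GHASH round must still be fed the ciphertext.
#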
871.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
872TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
873
874	movdqa	  \XMM1, \XMM5
875	movdqa	  \XMM2, \XMM6
876	movdqa	  \XMM3, \XMM7
877	movdqa	  \XMM4, \XMM8
878
879        movdqa    SHUF_MASK(%rip), %xmm15
880        # multiply TMP5 * HashKey using karatsuba
881
882	movdqa	  \XMM5, \TMP4
883	pshufd	  $78, \XMM5, \TMP6
884	pxor	  \XMM5, \TMP6
885	paddd     ONE(%rip), \XMM0		# INCR CNT
886	movdqa	  HashKey_4(%rsp), \TMP5
887	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
888	movdqa    \XMM0, \XMM1
889	paddd     ONE(%rip), \XMM0		# INCR CNT
890	movdqa    \XMM0, \XMM2
891	paddd     ONE(%rip), \XMM0		# INCR CNT
892	movdqa    \XMM0, \XMM3
893	paddd     ONE(%rip), \XMM0		# INCR CNT
894	movdqa    \XMM0, \XMM4
895	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
896	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
897	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
898	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
899	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
900
901	pxor	  (%arg1), \XMM1
902	pxor	  (%arg1), \XMM2
903	pxor	  (%arg1), \XMM3
904	pxor	  (%arg1), \XMM4
905	movdqa	  HashKey_4_k(%rsp), \TMP5
906	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
907	movaps 0x10(%arg1), \TMP1
908	AESENC	  \TMP1, \XMM1              # Round 1
909	AESENC	  \TMP1, \XMM2
910	AESENC	  \TMP1, \XMM3
911	AESENC	  \TMP1, \XMM4
912	movaps 0x20(%arg1), \TMP1
913	AESENC	  \TMP1, \XMM1              # Round 2
914	AESENC	  \TMP1, \XMM2
915	AESENC	  \TMP1, \XMM3
916	AESENC	  \TMP1, \XMM4
917	movdqa	  \XMM6, \TMP1
918	pshufd	  $78, \XMM6, \TMP2
919	pxor	  \XMM6, \TMP2
920	movdqa	  HashKey_3(%rsp), \TMP5
921	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
922	movaps 0x30(%arg1), \TMP3
923	AESENC    \TMP3, \XMM1              # Round 3
924	AESENC    \TMP3, \XMM2
925	AESENC    \TMP3, \XMM3
926	AESENC    \TMP3, \XMM4
927	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
928	movaps 0x40(%arg1), \TMP3
929	AESENC	  \TMP3, \XMM1              # Round 4
930	AESENC	  \TMP3, \XMM2
931	AESENC	  \TMP3, \XMM3
932	AESENC	  \TMP3, \XMM4
933	movdqa	  HashKey_3_k(%rsp), \TMP5
934	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
935	movaps 0x50(%arg1), \TMP3
936	AESENC	  \TMP3, \XMM1              # Round 5
937	AESENC	  \TMP3, \XMM2
938	AESENC	  \TMP3, \XMM3
939	AESENC	  \TMP3, \XMM4
940	pxor	  \TMP1, \TMP4
941# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
942	pxor	  \XMM6, \XMM5
943	pxor	  \TMP2, \TMP6
944	movdqa	  \XMM7, \TMP1
945	pshufd	  $78, \XMM7, \TMP2
946	pxor	  \XMM7, \TMP2
947	movdqa	  HashKey_2(%rsp ), \TMP5
948
949        # Multiply TMP5 * HashKey using karatsuba
950
951	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
952	movaps 0x60(%arg1), \TMP3
953	AESENC	  \TMP3, \XMM1              # Round 6
954	AESENC	  \TMP3, \XMM2
955	AESENC	  \TMP3, \XMM3
956	AESENC	  \TMP3, \XMM4
957	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
958	movaps 0x70(%arg1), \TMP3
959	AESENC	  \TMP3, \XMM1             # Round 7
960	AESENC	  \TMP3, \XMM2
961	AESENC	  \TMP3, \XMM3
962	AESENC	  \TMP3, \XMM4
963	movdqa	  HashKey_2_k(%rsp), \TMP5
964	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
965	movaps 0x80(%arg1), \TMP3
966	AESENC	  \TMP3, \XMM1             # Round 8
967	AESENC	  \TMP3, \XMM2
968	AESENC	  \TMP3, \XMM3
969	AESENC	  \TMP3, \XMM4
970	pxor	  \TMP1, \TMP4
971# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
972	pxor	  \XMM7, \XMM5
973	pxor	  \TMP2, \TMP6
974
975        # Multiply XMM8 * HashKey
976        # XMM8 and TMP5 hold the values for the two operands
977
978	movdqa	  \XMM8, \TMP1
979	pshufd	  $78, \XMM8, \TMP2
980	pxor	  \XMM8, \TMP2
981	movdqa	  HashKey(%rsp), \TMP5
982	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
983	movaps 0x90(%arg1), \TMP3
984	AESENC	  \TMP3, \XMM1            # Round 9
985	AESENC	  \TMP3, \XMM2
986	AESENC	  \TMP3, \XMM3
987	AESENC	  \TMP3, \XMM4
988	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
989	movaps 0xa0(%arg1), \TMP3
990	AESENCLAST \TMP3, \XMM1           # Round 10
991	AESENCLAST \TMP3, \XMM2
992	AESENCLAST \TMP3, \XMM3
993	AESENCLAST \TMP3, \XMM4
994	movdqa    HashKey_k(%rsp), \TMP5
995	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
996	movdqu	  (%arg3,%r11,1), \TMP3
997	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
998	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
999	movdqa    \TMP3, \XMM1
1000	movdqu	  16(%arg3,%r11,1), \TMP3
1001	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1002	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
1003	movdqa    \TMP3, \XMM2
1004	movdqu	  32(%arg3,%r11,1), \TMP3
1005	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1006	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
1007	movdqa    \TMP3, \XMM3
1008	movdqu	  48(%arg3,%r11,1), \TMP3
1009	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1010	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
1011	movdqa    \TMP3, \XMM4
1012	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1013	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1014	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1015	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1016
1017	pxor	  \TMP4, \TMP1
1018	pxor	  \XMM8, \XMM5
1019	pxor	  \TMP6, \TMP2
1020	pxor	  \TMP1, \TMP2
1021	pxor	  \XMM5, \TMP2
1022	movdqa	  \TMP2, \TMP3
1023	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1024	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1025	pxor	  \TMP3, \XMM5
1026	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1027
1028        # first phase of reduction
1029
1030	movdqa    \XMM5, \TMP2
1031	movdqa    \XMM5, \TMP3
1032	movdqa    \XMM5, \TMP4
1033# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1034	pslld     $31, \TMP2                   # packed right shift << 31
1035	pslld     $30, \TMP3                   # packed right shift << 30
1036	pslld     $25, \TMP4                   # packed right shift << 25
1037	pxor      \TMP3, \TMP2	               # xor the shifted versions
1038	pxor      \TMP4, \TMP2
1039	movdqa    \TMP2, \TMP5
1040	psrldq    $4, \TMP5                    # right shift T5 1 DW
1041	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1042	pxor      \TMP2, \XMM5
1043
1044        # second phase of reduction
1045
1046	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1047	movdqa    \XMM5,\TMP3
1048	movdqa    \XMM5,\TMP4
1049	psrld     $1, \TMP2                    # packed left shift >>1
1050	psrld     $2, \TMP3                    # packed left shift >>2
1051	psrld     $7, \TMP4                    # packed left shift >>7
1052	pxor      \TMP3,\TMP2		       # xor the shifted versions
1053	pxor      \TMP4,\TMP2
1054	pxor      \TMP5, \TMP2
1055	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # result is in XMM5
1057
1058	pxor	  \XMM5, \XMM1
1059.endm
1060
1061/* GHASH the last 4 ciphertext blocks. */
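# Computes XMMDst = XMM1*HashKey^4 + XMM2*HashKey^3 + XMM3*HashKey^2 +
# XMM4*HashKey (carry-less, with one shared two-phase reduction at the end),
# folding the four outstanding blocks into the final GHASH value.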
1062.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1063TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1064
1065        # Multiply TMP6 * HashKey (using Karatsuba)
1066
1067	movdqa	  \XMM1, \TMP6
1068	pshufd	  $78, \XMM1, \TMP2
1069	pxor	  \XMM1, \TMP2
1070	movdqa	  HashKey_4(%rsp), \TMP5
1071	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1072	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1073	movdqa	  HashKey_4_k(%rsp), \TMP4
1074	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1075	movdqa	  \XMM1, \XMMDst
1076	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1077
1078        # Multiply TMP1 * HashKey (using Karatsuba)
1079
1080	movdqa	  \XMM2, \TMP1
1081	pshufd	  $78, \XMM2, \TMP2
1082	pxor	  \XMM2, \TMP2
1083	movdqa	  HashKey_3(%rsp), \TMP5
1084	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1085	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1086	movdqa	  HashKey_3_k(%rsp), \TMP4
1087	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1088	pxor	  \TMP1, \TMP6
1089	pxor	  \XMM2, \XMMDst
1090	pxor	  \TMP2, \XMM1
1091# results accumulated in TMP6, XMMDst, XMM1
1092
1093        # Multiply TMP1 * HashKey (using Karatsuba)
1094
1095	movdqa	  \XMM3, \TMP1
1096	pshufd	  $78, \XMM3, \TMP2
1097	pxor	  \XMM3, \TMP2
1098	movdqa	  HashKey_2(%rsp), \TMP5
1099	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1100	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1101	movdqa	  HashKey_2_k(%rsp), \TMP4
1102	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1103	pxor	  \TMP1, \TMP6
1104	pxor	  \XMM3, \XMMDst
1105	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1106
1107        # Multiply TMP1 * HashKey (using Karatsuba)
1108	movdqa	  \XMM4, \TMP1
1109	pshufd	  $78, \XMM4, \TMP2
1110	pxor	  \XMM4, \TMP2
1111	movdqa	  HashKey(%rsp), \TMP5
1112	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1113	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1114	movdqa	  HashKey_k(%rsp), \TMP4
1115	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1116	pxor	  \TMP1, \TMP6
1117	pxor	  \XMM4, \XMMDst
1118	pxor	  \XMM1, \TMP2
1119	pxor	  \TMP6, \TMP2
1120	pxor	  \XMMDst, \TMP2
1121	# middle section of the temp results combined as in karatsuba algorithm
1122	movdqa	  \TMP2, \TMP4
1123	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1124	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1125	pxor	  \TMP4, \XMMDst
1126	pxor	  \TMP2, \TMP6
1127# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1128	# first phase of the reduction
1129	movdqa    \XMMDst, \TMP2
1130	movdqa    \XMMDst, \TMP3
1131	movdqa    \XMMDst, \TMP4
1132# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1133	pslld     $31, \TMP2                # packed right shifting << 31
1134	pslld     $30, \TMP3                # packed right shifting << 30
1135	pslld     $25, \TMP4                # packed right shifting << 25
1136	pxor      \TMP3, \TMP2              # xor the shifted versions
1137	pxor      \TMP4, \TMP2
1138	movdqa    \TMP2, \TMP7
1139	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1140	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1141	pxor      \TMP2, \XMMDst
1142
1143        # second phase of the reduction
1144	movdqa    \XMMDst, \TMP2
1145	# make 3 copies of XMMDst for doing 3 shift operations
1146	movdqa    \XMMDst, \TMP3
1147	movdqa    \XMMDst, \TMP4
1148	psrld     $1, \TMP2                 # packed left shift >> 1
1149	psrld     $2, \TMP3                 # packed left shift >> 2
1150	psrld     $7, \TMP4                 # packed left shift >> 7
1151	pxor      \TMP3, \TMP2              # xor the shifted versions
1152	pxor      \TMP4, \TMP2
1153	pxor      \TMP7, \TMP2
1154	pxor      \TMP2, \XMMDst
1155	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1156.endm
1157
/* Encrypt a single block */
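# Runs the ten AES-128 rounds on XMM0 in place using the 11 round keys of
# the expanded schedule at (%arg1); used for E(K, Y0) when computing the
# tag and for E(K, Yn) on the final partial block.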
1159.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1160
1161	pxor	(%arg1), \XMM0
1162        movaps 16(%arg1), \TMP1
1163	AESENC	\TMP1, \XMM0
1164        movaps 32(%arg1), \TMP1
1165	AESENC	\TMP1, \XMM0
1166        movaps 48(%arg1), \TMP1
1167	AESENC	\TMP1, \XMM0
1168        movaps 64(%arg1), \TMP1
1169	AESENC	\TMP1, \XMM0
1170        movaps 80(%arg1), \TMP1
1171	AESENC	\TMP1, \XMM0
1172        movaps 96(%arg1), \TMP1
1173	AESENC	\TMP1, \XMM0
1174        movaps 112(%arg1), \TMP1
1175	AESENC	\TMP1, \XMM0
1176        movaps 128(%arg1), \TMP1
1177	AESENC	\TMP1, \XMM0
1178        movaps 144(%arg1), \TMP1
1179	AESENC	\TMP1, \XMM0
1180        movaps 160(%arg1), \TMP1
1181	AESENCLAST	\TMP1, \XMM0
1182.endm
1183
1184
1185/*****************************************************************************
1186* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1187*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1188*                   const u8 *in,      // Ciphertext input
1189*                   u64 plaintext_len, // Length of data in bytes for decryption.
1190*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1191*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1192*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1193*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1194*                   const u8 *aad,     // Additional Authentication Data (AAD)
1195*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1196*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1197*                                      // given authentication tag and only return the plaintext if they match.
1198*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1199*                                      // (most likely), 12 or 8.
1200*
1201* Assumptions:
1202*
1203* keys:
1204*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1205*       set of 11 keys in the data structure void *aes_ctx
1206*
1207* iv:
1208*       0                   1                   2                   3
1209*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1210*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1211*       |                             Salt  (From the SA)               |
1212*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1213*       |                     Initialization Vector                     |
1214*       |         (This is the sequence number from IPSec header)       |
1215*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1216*       |                              0x1                              |
1217*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1218*
1219*
1220*
1221* AAD:
1222*       AAD padded to 128 bits with 0
1223*       for example, assume AAD is a u32 vector
1224*
1225*       if AAD is 8 bytes:
1226*       AAD[3] = {A0, A1};
1227*       padded AAD in xmm register = {A1 A0 0 0}
1228*
1229*       0                   1                   2                   3
1230*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1231*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*       |                               SPI (A1)                        |
1233*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1234*       |                     32-bit Sequence Number (A0)               |
1235*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1236*       |                              0x0                              |
1237*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1238*
1239*                                       AAD Format with 32-bit Sequence Number
1240*
1241*       if AAD is 12 bytes:
1242*       AAD[3] = {A0, A1, A2};
1243*       padded AAD in xmm register = {A2 A1 A0 0}
1244*
1245*       0                   1                   2                   3
1246*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1247*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1248*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1249*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1250*       |                               SPI (A2)                        |
1251*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1252*       |                 64-bit Extended Sequence Number {A1,A0}       |
1253*       |                                                               |
1254*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1255*       |                              0x0                              |
1256*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1257*
1258*                        AAD Format with 64-bit Extended Sequence Number
1259*
1260* aadLen:
1261*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1262*       The code supports 16 too but for other sizes, the code will fail.
1263*
1264* TLen:
1265*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1266*       For other sizes, the code will fail.
1267*
1268* poly = x^128 + x^127 + x^126 + x^121 + 1
1269*
1270*****************************************************************************/
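#
# Processing outline (GCM decryption):
#   1. derive HashKey<<1 mod poly from *hash_subkey and precompute its powers
#   2. CTR-decrypt the bulk ciphertext four blocks at a time while GHASHing
#      the ciphertext, with the AAD already folded into the hash
#   3. handle the final partial block, fold len(A)||len(C) into the hash and
#      encrypt the result with E(K, Y0) to produce the authentication tag
#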
1271ENTRY(aesni_gcm_dec)
1272	push	%r12
1273	push	%r13
1274	push	%r14
1275	mov	%rsp, %r14
1276/*
1277* states of %xmm registers %xmm6:%xmm15 not saved
1278* all %xmm registers are clobbered
1279*/
1280	sub	$VARIABLE_OFFSET, %rsp
1281	and	$~63, %rsp                        # align rsp to 64 bytes
1282	mov	%arg6, %r12
1283	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
1284        movdqa  SHUF_MASK(%rip), %xmm2
1285	PSHUFB_XMM %xmm2, %xmm13
1286
1287
1288# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1289
1290	movdqa	%xmm13, %xmm2
1291	psllq	$1, %xmm13
1292	psrlq	$63, %xmm2
1293	movdqa	%xmm2, %xmm1
1294	pslldq	$8, %xmm2
1295	psrldq	$8, %xmm1
1296	por	%xmm2, %xmm13
1297
1298        # Reduction
1299
1300	pshufd	$0x24, %xmm1, %xmm2
1301	pcmpeqd TWOONE(%rip), %xmm2
1302	pand	POLY(%rip), %xmm2
1303	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
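# As noted above GHASH_MUL, the hash key is pre-multiplied by x here
# (HashKey<<1 mod poly) so that the extra factor introduced by each
# GHASH_MUL invocation cancels and a plain GH*HashKey mod poly results.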
1304
1305
1306        # Decrypt first few blocks
1307
1308	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
1309	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
1310	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
1311	mov %r13, %r12
1312	and $(3<<4), %r12
1313	jz _initial_num_blocks_is_0_decrypt
1314	cmp $(2<<4), %r12
1315	jb _initial_num_blocks_is_1_decrypt
1316	je _initial_num_blocks_is_2_decrypt
1317_initial_num_blocks_is_3_decrypt:
1318	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1320	sub	$48, %r13
1321	jmp	_initial_blocks_decrypted
1322_initial_num_blocks_is_2_decrypt:
1323	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1325	sub	$32, %r13
1326	jmp	_initial_blocks_decrypted
1327_initial_num_blocks_is_1_decrypt:
1328	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1330	sub	$16, %r13
1331	jmp	_initial_blocks_decrypted
1332_initial_num_blocks_is_0_decrypt:
1333	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1334%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1335_initial_blocks_decrypted:
1336	cmp	$0, %r13
1337	je	_zero_cipher_left_decrypt
1338	sub	$64, %r13
1339	je	_four_cipher_left_decrypt
1340_decrypt_by_4:
1341	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1342%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1343	add	$64, %r11
1344	sub	$64, %r13
1345	jne	_decrypt_by_4
1346_four_cipher_left_decrypt:
1347	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1348%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1349_zero_cipher_left_decrypt:
1350	mov	%arg4, %r13
1351	and	$15, %r13				# %r13 = arg4 (mod 16)
1352	je	_multiple_of_16_bytes_decrypt
1353
1354        # Handle the last <16 byte block separately
1355
1356	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
1357        movdqa SHUF_MASK(%rip), %xmm10
1358	PSHUFB_XMM %xmm10, %xmm0
1359
1360	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
1361	sub $16, %r11
1362	add %r13, %r11
1363	movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
1364	lea SHIFT_MASK+16(%rip), %r12
1365	sub %r13, %r12
1366# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1367# (%r13 is the number of bytes in plaintext mod 16)
1368	movdqu (%r12), %xmm2           # get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes
1370
1371	movdqa  %xmm1, %xmm2
1372	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
1373	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1374	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1375	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
1376	pand    %xmm1, %xmm2
1377        movdqa SHUF_MASK(%rip), %xmm10
1378	PSHUFB_XMM %xmm10 ,%xmm2
1379
1380	pxor %xmm2, %xmm8
1381	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1382	          # GHASH computation for the last <16 byte block
1383	sub %r13, %r11
1384	add $16, %r11
1385
1386        # output %r13 bytes
1387	MOVQ_R64_XMM	%xmm0, %rax
1388	cmp	$8, %r13
1389	jle	_less_than_8_bytes_left_decrypt
1390	mov	%rax, (%arg2 , %r11, 1)
1391	add	$8, %r11
1392	psrldq	$8, %xmm0
1393	MOVQ_R64_XMM	%xmm0, %rax
1394	sub	$8, %r13
1395_less_than_8_bytes_left_decrypt:
1396	mov	%al,  (%arg2, %r11, 1)
1397	add	$1, %r11
1398	shr	$8, %rax
1399	sub	$1, %r13
1400	jne	_less_than_8_bytes_left_decrypt
1401_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1403	shl	$3, %r12		  # convert into number of bits
1404	movd	%r12d, %xmm15		  # len(A) in %xmm15
	shl	$3, %arg4		  # len(C) in bits (*8)
1406	MOVQ_R64_XMM	%arg4, %xmm1
1407	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
1408	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
1409	pxor	%xmm15, %xmm8
1410	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1411	         # final GHASH computation
1412        movdqa SHUF_MASK(%rip), %xmm10
1413	PSHUFB_XMM %xmm10, %xmm8
1414
1415	mov	%arg5, %rax		  # %rax = *Y0
1416	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
1417	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
1418	pxor	%xmm8, %xmm0
1419_return_T_decrypt:
1420	mov	arg9, %r10                # %r10 = authTag
1421	mov	arg10, %r11               # %r11 = auth_tag_len
1422	cmp	$16, %r11
1423	je	_T_16_decrypt
1424	cmp	$12, %r11
1425	je	_T_12_decrypt
1426_T_8_decrypt:
1427	MOVQ_R64_XMM	%xmm0, %rax
1428	mov	%rax, (%r10)
1429	jmp	_return_T_done_decrypt
1430_T_12_decrypt:
1431	MOVQ_R64_XMM	%xmm0, %rax
1432	mov	%rax, (%r10)
1433	psrldq	$8, %xmm0
1434	movd	%xmm0, %eax
1435	mov	%eax, 8(%r10)
1436	jmp	_return_T_done_decrypt
1437_T_16_decrypt:
1438	movdqu	%xmm0, (%r10)
1439_return_T_done_decrypt:
1440	mov	%r14, %rsp
1441	pop	%r14
1442	pop	%r13
1443	pop	%r12
1444	ret
1445ENDPROC(aesni_gcm_dec)
1446
1447
1448/*****************************************************************************
1449* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1450*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1451*                    const u8 *in,       // Plaintext input
1452*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1453*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1454*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1455*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1456*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1457*                    const u8 *aad,      // Additional Authentication Data (AAD)
1458*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1459*                    u8 *auth_tag,       // Authenticated Tag output.
1460*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1461*                                        // 12 or 8.
1462*
1463* Assumptions:
1464*
1465* keys:
1466*       keys are pre-expanded and aligned to 16 bytes. we are using the
1467*       first set of 11 keys in the data structure void *aes_ctx
1468*
1469*
1470* iv:
1471*       0                   1                   2                   3
1472*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1473*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1474*       |                             Salt  (From the SA)               |
1475*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1476*       |                     Initialization Vector                     |
1477*       |         (This is the sequence number from IPSec header)       |
1478*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1479*       |                              0x1                              |
1480*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1481*
1482*
1483*
1484* AAD:
1485*       AAD padded to 128 bits with 0
1486*       for example, assume AAD is a u32 vector
1487*
1488*       if AAD is 8 bytes:
1489*       AAD[3] = {A0, A1};
1490*       padded AAD in xmm register = {A1 A0 0 0}
1491*
1492*       0                   1                   2                   3
1493*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1494*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*       |                               SPI (A1)                        |
1496*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1497*       |                     32-bit Sequence Number (A0)               |
1498*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1499*       |                              0x0                              |
1500*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1501*
1502*                                 AAD Format with 32-bit Sequence Number
1503*
1504*       if AAD is 12 bytes:
1505*       AAD[3] = {A0, A1, A2};
1506*       padded AAD in xmm register = {A2 A1 A0 0}
1507*
1508*       0                   1                   2                   3
1509*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1510*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1511*       |                               SPI (A2)                        |
1512*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1513*       |                 64-bit Extended Sequence Number {A1,A0}       |
1514*       |                                                               |
1515*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1516*       |                              0x0                              |
1517*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1518*
1519*                         AAD Format with 64-bit Extended Sequence Number
1520*
1521* aadLen:
1522*       From the definition of the spec, aadLen can only be 8 or 12 bytes.
1523*       The code also supports a 16-byte AAD; for any other size it will fail.
1524*
1525* TLen:
1526*       From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1527*       For any other size, the code will fail.
1528*
1529* poly = x^128 + x^127 + x^126 + x^121 + 1
1530***************************************************************************/
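/*
 * Illustrative C sketch only (not part of this file): how the inputs
 * described above fit together for RFC4106.  The helper name build_j0()
 * and struct rfc4106_iv are assumptions made for this example.
 *
 *	struct rfc4106_iv { u8 salt[4]; u8 esp_iv[8]; u8 ctr[4]; };
 *
 *	static void build_j0(struct rfc4106_iv *j0, const u8 *salt,
 *			     const u8 *esp_iv)
 *	{
 *		memcpy(j0->salt, salt, 4);	// from the SA
 *		memcpy(j0->esp_iv, esp_iv, 8);	// from the ESP payload
 *		j0->ctr[0] = 0;			// trailing 32-bit 0x00000001,
 *		j0->ctr[1] = 0;			// big endian
 *		j0->ctr[2] = 0;
 *		j0->ctr[3] = 1;
 *	}
 *
 * and the final GHASH block is len(A) || len(C), both in bits, big endian:
 *
 *	u64 len_block[2] = { cpu_to_be64(aad_len * 8),
 *			     cpu_to_be64(plaintext_len * 8) };
 */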
1531ENTRY(aesni_gcm_enc)
1532	push	%r12
1533	push	%r13
1534	push	%r14
1535	mov	%rsp, %r14
1536#
1537# states of %xmm registers %xmm6:%xmm15 not saved
1538# all %xmm registers are clobbered
1539#
1540	sub	$VARIABLE_OFFSET, %rsp
1541	and	$~63, %rsp
1542	mov	%arg6, %r12
1543	movdqu	(%r12), %xmm13
1544        movdqa  SHUF_MASK(%rip), %xmm2
1545	PSHUFB_XMM %xmm2, %xmm13
1546
1547
1548# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1549
1550	movdqa	%xmm13, %xmm2
1551	psllq	$1, %xmm13
1552	psrlq	$63, %xmm2
1553	movdqa	%xmm2, %xmm1
1554	pslldq	$8, %xmm2
1555	psrldq	$8, %xmm1
1556	por	%xmm2, %xmm13
1557
1558        # reduce HashKey<<1
1559
1560	pshufd	$0x24, %xmm1, %xmm2
1561	pcmpeqd TWOONE(%rip), %xmm2
1562	pand	POLY(%rip), %xmm2
1563	pxor	%xmm2, %xmm13
1564	movdqa	%xmm13, HashKey(%rsp)
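
	# In scalar C terms, the shift-and-reduce above is roughly the
	# following (illustrative sketch; hi/lo are the two 64-bit halves of
	# the byte-reflected HashKey, POLY is the constant defined above):
	#
	#	carry = hi >> 63;
	#	hi    = (hi << 1) | (lo >> 63);
	#	lo    = lo << 1;
	#	if (carry) {
	#		hi ^= 0xC200000000000000ULL;	// high half of POLY
	#		lo ^= 0x0000000000000001ULL;	// low half of POLY
	#	}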
1565	mov	%arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
1566	and	$-16, %r13
1567	mov	%r13, %r12
1568
1569        # Encrypt first few blocks
1570
1571	and	$(3<<4), %r12
1572	jz	_initial_num_blocks_is_0_encrypt
1573	cmp	$(2<<4), %r12
1574	jb	_initial_num_blocks_is_1_encrypt
1575	je	_initial_num_blocks_is_2_encrypt
1576_initial_num_blocks_is_3_encrypt:
1577	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1578%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1579	sub	$48, %r13
1580	jmp	_initial_blocks_encrypted
1581_initial_num_blocks_is_2_encrypt:
1582	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1583%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1584	sub	$32, %r13
1585	jmp	_initial_blocks_encrypted
1586_initial_num_blocks_is_1_encrypt:
1587	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1588%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1589	sub	$16, %r13
1590	jmp	_initial_blocks_encrypted
1591_initial_num_blocks_is_0_encrypt:
1592	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1593%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1594_initial_blocks_encrypted:
1595
1596        # Main loop - Encrypt remaining blocks
1597
1598	cmp	$0, %r13
1599	je	_zero_cipher_left_encrypt
1600	sub	$64, %r13
1601	je	_four_cipher_left_encrypt
1602_encrypt_by_4_encrypt:
1603	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1604%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1605	add	$64, %r11
1606	sub	$64, %r13
1607	jne	_encrypt_by_4_encrypt
1608_four_cipher_left_encrypt:
1609	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1610%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1611_zero_cipher_left_encrypt:
1612	mov	%arg4, %r13
1613	and	$15, %r13			# %r13 = arg4 (mod 16)
1614	je	_multiple_of_16_bytes_encrypt
1615
1616         # Handle the last <16 Byte block separately
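	# In C terms, this tail is roughly (illustrative sketch; r is the
	# plaintext length mod 16, aes_encrypt() a stand-in for
	# ENCRYPT_SINGLE_BLOCK):
	#
	#	u8 ks[16];
	#	aes_encrypt(key, ks, yn);	// keystream = Encrypt(K, Yn)
	#	for (i = 0; i < r; i++)
	#		dst[len - r + i] = src[len - r + i] ^ ks[i];
	#	// the same r bytes, zero-padded to 16, are also fed into GHASH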
1617	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
1618        movdqa SHUF_MASK(%rip), %xmm10
1619	PSHUFB_XMM %xmm10, %xmm0
1620
1621
1622	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
1623	sub $16, %r11
1624	add %r13, %r11
1625	movdqu (%arg3,%r11,1), %xmm1     # read the last <16 byte block
1626	lea SHIFT_MASK+16(%rip), %r12
1627	sub %r13, %r12
1628	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1629	# (%r13 is the number of bytes in plaintext mod 16)
1630	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
1631	PSHUFB_XMM	%xmm2, %xmm1            # shift right 16-r13 bytes
1632	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
1633	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
1634	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
1635	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
1636        movdqa SHUF_MASK(%rip), %xmm10
1637	PSHUFB_XMM %xmm10,%xmm0
1638
1639	pxor	%xmm0, %xmm8
1640	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1641	# GHASH computation for the last <16 byte block
1642	sub	%r13, %r11
1643	add	$16, %r11
1644
1645	movdqa SHUF_MASK(%rip), %xmm10
1646	PSHUFB_XMM %xmm10, %xmm0
1647
1648	# shuffle xmm0 back to output as ciphertext
1649
1650        # Output %r13 bytes
1651	MOVQ_R64_XMM %xmm0, %rax
1652	cmp $8, %r13
1653	jle _less_than_8_bytes_left_encrypt
1654	mov %rax, (%arg2 , %r11, 1)
1655	add $8, %r11
1656	psrldq $8, %xmm0
1657	MOVQ_R64_XMM %xmm0, %rax
1658	sub $8, %r13
1659_less_than_8_bytes_left_encrypt:
1660	mov %al,  (%arg2, %r11, 1)
1661	add $1, %r11
1662	shr $8, %rax
1663	sub $1, %r13
1664	jne _less_than_8_bytes_left_encrypt
1665_multiple_of_16_bytes_encrypt:
1666	mov	arg8, %r12    # %r12 = aadLen (number of bytes)
1667	shl	$3, %r12
1668	movd	%r12d, %xmm15       # len(A) in %xmm15
1669	shl	$3, %arg4               # len(C) in bits (*8)
1670	MOVQ_R64_XMM	%arg4, %xmm1
1671	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
1672	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
1673	pxor	%xmm15, %xmm8
1674	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1675	# final GHASH computation
1676        movdqa SHUF_MASK(%rip), %xmm10
1677	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
1678
1679	mov	%arg5, %rax		       # %rax  = *Y0
1680	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
1681	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
1682	pxor	%xmm8, %xmm0
1683_return_T_encrypt:
1684	mov	arg9, %r10                     # %r10 = authTag
1685	mov	arg10, %r11                    # %r11 = auth_tag_len
1686	cmp	$16, %r11
1687	je	_T_16_encrypt
1688	cmp	$12, %r11
1689	je	_T_12_encrypt
1690_T_8_encrypt:
1691	MOVQ_R64_XMM	%xmm0, %rax
1692	mov	%rax, (%r10)
1693	jmp	_return_T_done_encrypt
1694_T_12_encrypt:
1695	MOVQ_R64_XMM	%xmm0, %rax
1696	mov	%rax, (%r10)
1697	psrldq	$8, %xmm0
1698	movd	%xmm0, %eax
1699	mov	%eax, 8(%r10)
1700	jmp	_return_T_done_encrypt
1701_T_16_encrypt:
1702	movdqu	%xmm0, (%r10)
1703_return_T_done_encrypt:
1704	mov	%r14, %rsp
1705	pop	%r14
1706	pop	%r13
1707	pop	%r12
1708	ret
1709ENDPROC(aesni_gcm_enc)
1710
1711#endif
1712
1713
1714.align 4
1715_key_expansion_128:
1716_key_expansion_256a:
1717	pshufd $0b11111111, %xmm1, %xmm1
1718	shufps $0b00010000, %xmm0, %xmm4
1719	pxor %xmm4, %xmm0
1720	shufps $0b10001100, %xmm0, %xmm4
1721	pxor %xmm4, %xmm0
1722	pxor %xmm1, %xmm0
1723	movaps %xmm0, (TKEYP)
1724	add $0x10, TKEYP
1725	ret
1726ENDPROC(_key_expansion_128)
1727ENDPROC(_key_expansion_256a)
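
/*
 * For reference, one AES-128 key schedule step as performed above is,
 * in C intrinsics terms, roughly (illustrative sketch; the code above
 * achieves the same cumulative XOR with shufps against the zeroed %xmm4):
 *
 *	static __m128i expand_step(__m128i key, __m128i assist)
 *	{
 *		assist = _mm_shuffle_epi32(assist, 0xff); // broadcast rot/sub word
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
 *		return _mm_xor_si128(key, assist);
 *	}
 *
 * where "assist" is the AESKEYGENASSIST output for the round constant.
 */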
1728
1729.align 4
1730_key_expansion_192a:
1731	pshufd $0b01010101, %xmm1, %xmm1
1732	shufps $0b00010000, %xmm0, %xmm4
1733	pxor %xmm4, %xmm0
1734	shufps $0b10001100, %xmm0, %xmm4
1735	pxor %xmm4, %xmm0
1736	pxor %xmm1, %xmm0
1737
1738	movaps %xmm2, %xmm5
1739	movaps %xmm2, %xmm6
1740	pslldq $4, %xmm5
1741	pshufd $0b11111111, %xmm0, %xmm3
1742	pxor %xmm3, %xmm2
1743	pxor %xmm5, %xmm2
1744
1745	movaps %xmm0, %xmm1
1746	shufps $0b01000100, %xmm0, %xmm6
1747	movaps %xmm6, (TKEYP)
1748	shufps $0b01001110, %xmm2, %xmm1
1749	movaps %xmm1, 0x10(TKEYP)
1750	add $0x20, TKEYP
1751	ret
1752ENDPROC(_key_expansion_192a)
1753
1754.align 4
1755_key_expansion_192b:
1756	pshufd $0b01010101, %xmm1, %xmm1
1757	shufps $0b00010000, %xmm0, %xmm4
1758	pxor %xmm4, %xmm0
1759	shufps $0b10001100, %xmm0, %xmm4
1760	pxor %xmm4, %xmm0
1761	pxor %xmm1, %xmm0
1762
1763	movaps %xmm2, %xmm5
1764	pslldq $4, %xmm5
1765	pshufd $0b11111111, %xmm0, %xmm3
1766	pxor %xmm3, %xmm2
1767	pxor %xmm5, %xmm2
1768
1769	movaps %xmm0, (TKEYP)
1770	add $0x10, TKEYP
1771	ret
1772ENDPROC(_key_expansion_192b)
1773
1774.align 4
1775_key_expansion_256b:
1776	pshufd $0b10101010, %xmm1, %xmm1
1777	shufps $0b00010000, %xmm2, %xmm4
1778	pxor %xmm4, %xmm2
1779	shufps $0b10001100, %xmm2, %xmm4
1780	pxor %xmm4, %xmm2
1781	pxor %xmm1, %xmm2
1782	movaps %xmm2, (TKEYP)
1783	add $0x10, TKEYP
1784	ret
1785ENDPROC(_key_expansion_256b)
1786
1787/*
1788 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1789 *                   unsigned int key_len)
1790 */
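/*
 * The fixed offsets used below (240(KEYP) for the decryption schedule and
 * 480(KEYP) for the key length) rely on the usual struct crypto_aes_ctx
 * layout, roughly sketched here for orientation (the authoritative
 * definition lives in the crypto headers):
 *
 *	struct crypto_aes_ctx {
 *		u32 key_enc[60];	// offset   0: encryption round keys
 *		u32 key_dec[60];	// offset 240: decryption round keys
 *		u32 key_length;		// offset 480: key length in bytes
 *	};
 */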
1791ENTRY(aesni_set_key)
1792#ifndef __x86_64__
1793	pushl KEYP
1794	movl 8(%esp), KEYP		# ctx
1795	movl 12(%esp), UKEYP		# in_key
1796	movl 16(%esp), %edx		# key_len
1797#endif
1798	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1799	movaps %xmm0, (KEYP)
1800	lea 0x10(KEYP), TKEYP		# key addr
1801	movl %edx, 480(KEYP)
1802	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1803	cmp $24, %dl
1804	jb .Lenc_key128
1805	je .Lenc_key192
1806	movups 0x10(UKEYP), %xmm2	# other user key
1807	movaps %xmm2, (TKEYP)
1808	add $0x10, TKEYP
1809	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1810	call _key_expansion_256a
1811	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1812	call _key_expansion_256b
1813	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1814	call _key_expansion_256a
1815	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1816	call _key_expansion_256b
1817	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1818	call _key_expansion_256a
1819	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1820	call _key_expansion_256b
1821	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1822	call _key_expansion_256a
1823	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1824	call _key_expansion_256b
1825	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1826	call _key_expansion_256a
1827	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1828	call _key_expansion_256b
1829	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1830	call _key_expansion_256a
1831	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1832	call _key_expansion_256b
1833	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1834	call _key_expansion_256a
1835	jmp .Ldec_key
1836.Lenc_key192:
1837	movq 0x10(UKEYP), %xmm2		# other user key
1838	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1839	call _key_expansion_192a
1840	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1841	call _key_expansion_192b
1842	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1843	call _key_expansion_192a
1844	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1845	call _key_expansion_192b
1846	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1847	call _key_expansion_192a
1848	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1849	call _key_expansion_192b
1850	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1851	call _key_expansion_192a
1852	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
1853	call _key_expansion_192b
1854	jmp .Ldec_key
1855.Lenc_key128:
1856	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
1857	call _key_expansion_128
1858	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
1859	call _key_expansion_128
1860	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
1861	call _key_expansion_128
1862	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
1863	call _key_expansion_128
1864	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
1865	call _key_expansion_128
1866	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
1867	call _key_expansion_128
1868	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
1869	call _key_expansion_128
1870	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
1871	call _key_expansion_128
1872	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
1873	call _key_expansion_128
1874	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
1875	call _key_expansion_128
1876.Ldec_key:
1877	sub $0x10, TKEYP
1878	movaps (KEYP), %xmm0
1879	movaps (TKEYP), %xmm1
1880	movaps %xmm0, 240(TKEYP)
1881	movaps %xmm1, 240(KEYP)
1882	add $0x10, KEYP
1883	lea 240-16(TKEYP), UKEYP
1884.align 4
1885.Ldec_key_loop:
1886	movaps (KEYP), %xmm0
1887	AESIMC %xmm0 %xmm1
1888	movaps %xmm1, (UKEYP)
1889	add $0x10, KEYP
1890	sub $0x10, UKEYP
1891	cmp TKEYP, KEYP
1892	jb .Ldec_key_loop
1893	xor AREG, AREG
1894#ifndef __x86_64__
1895	popl KEYP
1896#endif
1897	ret
1898ENDPROC(aesni_set_key)
1899
1900/*
1901 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1902 */
1903ENTRY(aesni_enc)
1904#ifndef __x86_64__
1905	pushl KEYP
1906	pushl KLEN
1907	movl 12(%esp), KEYP
1908	movl 16(%esp), OUTP
1909	movl 20(%esp), INP
1910#endif
1911	movl 480(KEYP), KLEN		# key length
1912	movups (INP), STATE		# input
1913	call _aesni_enc1
1914	movups STATE, (OUTP)		# output
1915#ifndef __x86_64__
1916	popl KLEN
1917	popl KEYP
1918#endif
1919	ret
1920ENDPROC(aesni_enc)
1921
1922/*
1923 * _aesni_enc1:		internal ABI
1924 * input:
1925 *	KEYP:		key struct pointer
1926 *	KLEN:		key length
1927 *	STATE:		initial state (input)
1928 * output:
1929 *	STATE:		final state (output)
1930 * changed:
1931 *	KEY
1932 *	TKEYP (T1)
1933 */
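/*
 * In C intrinsics terms, the unrolled sequence below is roughly
 * (illustrative sketch; nr is 10/12/14 and rk[] the expanded schedule):
 *
 *	__m128i s = _mm_xor_si128(in, rk[0]);		// round 0
 *	for (i = 1; i < nr; i++)
 *		s = _mm_aesenc_si128(s, rk[i]);		// middle rounds
 *	s = _mm_aesenclast_si128(s, rk[nr]);		// last round
 *
 * KLEN only selects how many of the leading rounds are skipped; the tail
 * of the unrolled sequence is shared by all three key sizes.
 */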
1934.align 4
1935_aesni_enc1:
1936	movaps (KEYP), KEY		# key
1937	mov KEYP, TKEYP
1938	pxor KEY, STATE		# round 0
1939	add $0x30, TKEYP
1940	cmp $24, KLEN
1941	jb .Lenc128
1942	lea 0x20(TKEYP), TKEYP
1943	je .Lenc192
1944	add $0x20, TKEYP
1945	movaps -0x60(TKEYP), KEY
1946	AESENC KEY STATE
1947	movaps -0x50(TKEYP), KEY
1948	AESENC KEY STATE
1949.align 4
1950.Lenc192:
1951	movaps -0x40(TKEYP), KEY
1952	AESENC KEY STATE
1953	movaps -0x30(TKEYP), KEY
1954	AESENC KEY STATE
1955.align 4
1956.Lenc128:
1957	movaps -0x20(TKEYP), KEY
1958	AESENC KEY STATE
1959	movaps -0x10(TKEYP), KEY
1960	AESENC KEY STATE
1961	movaps (TKEYP), KEY
1962	AESENC KEY STATE
1963	movaps 0x10(TKEYP), KEY
1964	AESENC KEY STATE
1965	movaps 0x20(TKEYP), KEY
1966	AESENC KEY STATE
1967	movaps 0x30(TKEYP), KEY
1968	AESENC KEY STATE
1969	movaps 0x40(TKEYP), KEY
1970	AESENC KEY STATE
1971	movaps 0x50(TKEYP), KEY
1972	AESENC KEY STATE
1973	movaps 0x60(TKEYP), KEY
1974	AESENC KEY STATE
1975	movaps 0x70(TKEYP), KEY
1976	AESENCLAST KEY STATE
1977	ret
1978ENDPROC(_aesni_enc1)
1979
1980/*
1981 * _aesni_enc4:	internal ABI
1982 * input:
1983 *	KEYP:		key struct pointer
1984 *	KLEN:		key length
1985 *	STATE1:		initial state (input)
1986 *	STATE2
1987 *	STATE3
1988 *	STATE4
1989 * output:
1990 *	STATE1:		final state (output)
1991 *	STATE2
1992 *	STATE3
1993 *	STATE4
1994 * changed:
1995 *	KEY
1996 *	TKEYP (T1)
1997 */
1998.align 4
1999_aesni_enc4:
2000	movaps (KEYP), KEY		# key
2001	mov KEYP, TKEYP
2002	pxor KEY, STATE1		# round 0
2003	pxor KEY, STATE2
2004	pxor KEY, STATE3
2005	pxor KEY, STATE4
2006	add $0x30, TKEYP
2007	cmp $24, KLEN
2008	jb .L4enc128
2009	lea 0x20(TKEYP), TKEYP
2010	je .L4enc192
2011	add $0x20, TKEYP
2012	movaps -0x60(TKEYP), KEY
2013	AESENC KEY STATE1
2014	AESENC KEY STATE2
2015	AESENC KEY STATE3
2016	AESENC KEY STATE4
2017	movaps -0x50(TKEYP), KEY
2018	AESENC KEY STATE1
2019	AESENC KEY STATE2
2020	AESENC KEY STATE3
2021	AESENC KEY STATE4
2022#.align 4
2023.L4enc192:
2024	movaps -0x40(TKEYP), KEY
2025	AESENC KEY STATE1
2026	AESENC KEY STATE2
2027	AESENC KEY STATE3
2028	AESENC KEY STATE4
2029	movaps -0x30(TKEYP), KEY
2030	AESENC KEY STATE1
2031	AESENC KEY STATE2
2032	AESENC KEY STATE3
2033	AESENC KEY STATE4
2034#.align 4
2035.L4enc128:
2036	movaps -0x20(TKEYP), KEY
2037	AESENC KEY STATE1
2038	AESENC KEY STATE2
2039	AESENC KEY STATE3
2040	AESENC KEY STATE4
2041	movaps -0x10(TKEYP), KEY
2042	AESENC KEY STATE1
2043	AESENC KEY STATE2
2044	AESENC KEY STATE3
2045	AESENC KEY STATE4
2046	movaps (TKEYP), KEY
2047	AESENC KEY STATE1
2048	AESENC KEY STATE2
2049	AESENC KEY STATE3
2050	AESENC KEY STATE4
2051	movaps 0x10(TKEYP), KEY
2052	AESENC KEY STATE1
2053	AESENC KEY STATE2
2054	AESENC KEY STATE3
2055	AESENC KEY STATE4
2056	movaps 0x20(TKEYP), KEY
2057	AESENC KEY STATE1
2058	AESENC KEY STATE2
2059	AESENC KEY STATE3
2060	AESENC KEY STATE4
2061	movaps 0x30(TKEYP), KEY
2062	AESENC KEY STATE1
2063	AESENC KEY STATE2
2064	AESENC KEY STATE3
2065	AESENC KEY STATE4
2066	movaps 0x40(TKEYP), KEY
2067	AESENC KEY STATE1
2068	AESENC KEY STATE2
2069	AESENC KEY STATE3
2070	AESENC KEY STATE4
2071	movaps 0x50(TKEYP), KEY
2072	AESENC KEY STATE1
2073	AESENC KEY STATE2
2074	AESENC KEY STATE3
2075	AESENC KEY STATE4
2076	movaps 0x60(TKEYP), KEY
2077	AESENC KEY STATE1
2078	AESENC KEY STATE2
2079	AESENC KEY STATE3
2080	AESENC KEY STATE4
2081	movaps 0x70(TKEYP), KEY
2082	AESENCLAST KEY STATE1		# last round
2083	AESENCLAST KEY STATE2
2084	AESENCLAST KEY STATE3
2085	AESENCLAST KEY STATE4
2086	ret
2087ENDPROC(_aesni_enc4)
2088
2089/*
2090 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2091 */
2092ENTRY(aesni_dec)
2093#ifndef __x86_64__
2094	pushl KEYP
2095	pushl KLEN
2096	movl 12(%esp), KEYP
2097	movl 16(%esp), OUTP
2098	movl 20(%esp), INP
2099#endif
2100	mov 480(KEYP), KLEN		# key length
2101	add $240, KEYP
2102	movups (INP), STATE		# input
2103	call _aesni_dec1
2104	movups STATE, (OUTP)		# output
2105#ifndef __x86_64__
2106	popl KLEN
2107	popl KEYP
2108#endif
2109	ret
2110ENDPROC(aesni_dec)
2111
2112/*
2113 * _aesni_dec1:		internal ABI
2114 * input:
2115 *	KEYP:		key struct pointer
2116 *	KLEN:		key length
2117 *	STATE:		initial state (input)
2118 * output:
2119 *	STATE:		final state (output)
2120 * changed:
2121 *	KEY
2122 *	TKEYP (T1)
2123 */
2124.align 4
2125_aesni_dec1:
2126	movaps (KEYP), KEY		# key
2127	mov KEYP, TKEYP
2128	pxor KEY, STATE		# round 0
2129	add $0x30, TKEYP
2130	cmp $24, KLEN
2131	jb .Ldec128
2132	lea 0x20(TKEYP), TKEYP
2133	je .Ldec192
2134	add $0x20, TKEYP
2135	movaps -0x60(TKEYP), KEY
2136	AESDEC KEY STATE
2137	movaps -0x50(TKEYP), KEY
2138	AESDEC KEY STATE
2139.align 4
2140.Ldec192:
2141	movaps -0x40(TKEYP), KEY
2142	AESDEC KEY STATE
2143	movaps -0x30(TKEYP), KEY
2144	AESDEC KEY STATE
2145.align 4
2146.Ldec128:
2147	movaps -0x20(TKEYP), KEY
2148	AESDEC KEY STATE
2149	movaps -0x10(TKEYP), KEY
2150	AESDEC KEY STATE
2151	movaps (TKEYP), KEY
2152	AESDEC KEY STATE
2153	movaps 0x10(TKEYP), KEY
2154	AESDEC KEY STATE
2155	movaps 0x20(TKEYP), KEY
2156	AESDEC KEY STATE
2157	movaps 0x30(TKEYP), KEY
2158	AESDEC KEY STATE
2159	movaps 0x40(TKEYP), KEY
2160	AESDEC KEY STATE
2161	movaps 0x50(TKEYP), KEY
2162	AESDEC KEY STATE
2163	movaps 0x60(TKEYP), KEY
2164	AESDEC KEY STATE
2165	movaps 0x70(TKEYP), KEY
2166	AESDECLAST KEY STATE
2167	ret
2168ENDPROC(_aesni_dec1)
2169
2170/*
2171 * _aesni_dec4:	internal ABI
2172 * input:
2173 *	KEYP:		key struct pointer
2174 *	KLEN:		key length
2175 *	STATE1:		initial state (input)
2176 *	STATE2
2177 *	STATE3
2178 *	STATE4
2179 * output:
2180 *	STATE1:		final state (output)
2181 *	STATE2
2182 *	STATE3
2183 *	STATE4
2184 * changed:
2185 *	KEY
2186 *	TKEYP (T1)
2187 */
2188.align 4
2189_aesni_dec4:
2190	movaps (KEYP), KEY		# key
2191	mov KEYP, TKEYP
2192	pxor KEY, STATE1		# round 0
2193	pxor KEY, STATE2
2194	pxor KEY, STATE3
2195	pxor KEY, STATE4
2196	add $0x30, TKEYP
2197	cmp $24, KLEN
2198	jb .L4dec128
2199	lea 0x20(TKEYP), TKEYP
2200	je .L4dec192
2201	add $0x20, TKEYP
2202	movaps -0x60(TKEYP), KEY
2203	AESDEC KEY STATE1
2204	AESDEC KEY STATE2
2205	AESDEC KEY STATE3
2206	AESDEC KEY STATE4
2207	movaps -0x50(TKEYP), KEY
2208	AESDEC KEY STATE1
2209	AESDEC KEY STATE2
2210	AESDEC KEY STATE3
2211	AESDEC KEY STATE4
2212.align 4
2213.L4dec192:
2214	movaps -0x40(TKEYP), KEY
2215	AESDEC KEY STATE1
2216	AESDEC KEY STATE2
2217	AESDEC KEY STATE3
2218	AESDEC KEY STATE4
2219	movaps -0x30(TKEYP), KEY
2220	AESDEC KEY STATE1
2221	AESDEC KEY STATE2
2222	AESDEC KEY STATE3
2223	AESDEC KEY STATE4
2224.align 4
2225.L4dec128:
2226	movaps -0x20(TKEYP), KEY
2227	AESDEC KEY STATE1
2228	AESDEC KEY STATE2
2229	AESDEC KEY STATE3
2230	AESDEC KEY STATE4
2231	movaps -0x10(TKEYP), KEY
2232	AESDEC KEY STATE1
2233	AESDEC KEY STATE2
2234	AESDEC KEY STATE3
2235	AESDEC KEY STATE4
2236	movaps (TKEYP), KEY
2237	AESDEC KEY STATE1
2238	AESDEC KEY STATE2
2239	AESDEC KEY STATE3
2240	AESDEC KEY STATE4
2241	movaps 0x10(TKEYP), KEY
2242	AESDEC KEY STATE1
2243	AESDEC KEY STATE2
2244	AESDEC KEY STATE3
2245	AESDEC KEY STATE4
2246	movaps 0x20(TKEYP), KEY
2247	AESDEC KEY STATE1
2248	AESDEC KEY STATE2
2249	AESDEC KEY STATE3
2250	AESDEC KEY STATE4
2251	movaps 0x30(TKEYP), KEY
2252	AESDEC KEY STATE1
2253	AESDEC KEY STATE2
2254	AESDEC KEY STATE3
2255	AESDEC KEY STATE4
2256	movaps 0x40(TKEYP), KEY
2257	AESDEC KEY STATE1
2258	AESDEC KEY STATE2
2259	AESDEC KEY STATE3
2260	AESDEC KEY STATE4
2261	movaps 0x50(TKEYP), KEY
2262	AESDEC KEY STATE1
2263	AESDEC KEY STATE2
2264	AESDEC KEY STATE3
2265	AESDEC KEY STATE4
2266	movaps 0x60(TKEYP), KEY
2267	AESDEC KEY STATE1
2268	AESDEC KEY STATE2
2269	AESDEC KEY STATE3
2270	AESDEC KEY STATE4
2271	movaps 0x70(TKEYP), KEY
2272	AESDECLAST KEY STATE1		# last round
2273	AESDECLAST KEY STATE2
2274	AESDECLAST KEY STATE3
2275	AESDECLAST KEY STATE4
2276	ret
2277ENDPROC(_aesni_dec4)
2278
2279/*
2280 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2281 *		      size_t len)
2282 */
2283ENTRY(aesni_ecb_enc)
2284#ifndef __x86_64__
2285	pushl LEN
2286	pushl KEYP
2287	pushl KLEN
2288	movl 16(%esp), KEYP
2289	movl 20(%esp), OUTP
2290	movl 24(%esp), INP
2291	movl 28(%esp), LEN
2292#endif
2293	test LEN, LEN		# check length
2294	jz .Lecb_enc_ret
2295	mov 480(KEYP), KLEN
2296	cmp $16, LEN
2297	jb .Lecb_enc_ret
2298	cmp $64, LEN
2299	jb .Lecb_enc_loop1
2300.align 4
2301.Lecb_enc_loop4:
2302	movups (INP), STATE1
2303	movups 0x10(INP), STATE2
2304	movups 0x20(INP), STATE3
2305	movups 0x30(INP), STATE4
2306	call _aesni_enc4
2307	movups STATE1, (OUTP)
2308	movups STATE2, 0x10(OUTP)
2309	movups STATE3, 0x20(OUTP)
2310	movups STATE4, 0x30(OUTP)
2311	sub $64, LEN
2312	add $64, INP
2313	add $64, OUTP
2314	cmp $64, LEN
2315	jge .Lecb_enc_loop4
2316	cmp $16, LEN
2317	jb .Lecb_enc_ret
2318.align 4
2319.Lecb_enc_loop1:
2320	movups (INP), STATE1
2321	call _aesni_enc1
2322	movups STATE1, (OUTP)
2323	sub $16, LEN
2324	add $16, INP
2325	add $16, OUTP
2326	cmp $16, LEN
2327	jge .Lecb_enc_loop1
2328.Lecb_enc_ret:
2329#ifndef __x86_64__
2330	popl KLEN
2331	popl KEYP
2332	popl LEN
2333#endif
2334	ret
2335ENDPROC(aesni_ecb_enc)
2336
2337/*
2338 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2339 *		      size_t len);
2340 */
2341ENTRY(aesni_ecb_dec)
2342#ifndef __x86_64__
2343	pushl LEN
2344	pushl KEYP
2345	pushl KLEN
2346	movl 16(%esp), KEYP
2347	movl 20(%esp), OUTP
2348	movl 24(%esp), INP
2349	movl 28(%esp), LEN
2350#endif
2351	test LEN, LEN
2352	jz .Lecb_dec_ret
2353	mov 480(KEYP), KLEN
2354	add $240, KEYP
2355	cmp $16, LEN
2356	jb .Lecb_dec_ret
2357	cmp $64, LEN
2358	jb .Lecb_dec_loop1
2359.align 4
2360.Lecb_dec_loop4:
2361	movups (INP), STATE1
2362	movups 0x10(INP), STATE2
2363	movups 0x20(INP), STATE3
2364	movups 0x30(INP), STATE4
2365	call _aesni_dec4
2366	movups STATE1, (OUTP)
2367	movups STATE2, 0x10(OUTP)
2368	movups STATE3, 0x20(OUTP)
2369	movups STATE4, 0x30(OUTP)
2370	sub $64, LEN
2371	add $64, INP
2372	add $64, OUTP
2373	cmp $64, LEN
2374	jge .Lecb_dec_loop4
2375	cmp $16, LEN
2376	jb .Lecb_dec_ret
2377.align 4
2378.Lecb_dec_loop1:
2379	movups (INP), STATE1
2380	call _aesni_dec1
2381	movups STATE1, (OUTP)
2382	sub $16, LEN
2383	add $16, INP
2384	add $16, OUTP
2385	cmp $16, LEN
2386	jge .Lecb_dec_loop1
2387.Lecb_dec_ret:
2388#ifndef __x86_64__
2389	popl KLEN
2390	popl KEYP
2391	popl LEN
2392#endif
2393	ret
2394ENDPROC(aesni_ecb_dec)
2395
2396/*
2397 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2398 *		      size_t len, u8 *iv)
2399 */
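/*
 * CBC encryption is inherently serial; in C terms the loop below is
 * roughly (illustrative sketch; aes_enc_block() and xor_block() are
 * stand-ins, not symbols from this file):
 *
 *	u8 state[16];
 *	memcpy(state, iv, 16);
 *	for (i = 0; i < len / 16; i++) {
 *		xor_block(state, src + 16 * i);		// state ^= P[i]
 *		aes_enc_block(ctx, state, state);	// state = E(K, state)
 *		memcpy(dst + 16 * i, state, 16);
 *	}
 *	memcpy(iv, state, 16);				// chain into the next call
 */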
2400ENTRY(aesni_cbc_enc)
2401#ifndef __x86_64__
2402	pushl IVP
2403	pushl LEN
2404	pushl KEYP
2405	pushl KLEN
2406	movl 20(%esp), KEYP
2407	movl 24(%esp), OUTP
2408	movl 28(%esp), INP
2409	movl 32(%esp), LEN
2410	movl 36(%esp), IVP
2411#endif
2412	cmp $16, LEN
2413	jb .Lcbc_enc_ret
2414	mov 480(KEYP), KLEN
2415	movups (IVP), STATE	# load iv as initial state
2416.align 4
2417.Lcbc_enc_loop:
2418	movups (INP), IN	# load input
2419	pxor IN, STATE
2420	call _aesni_enc1
2421	movups STATE, (OUTP)	# store output
2422	sub $16, LEN
2423	add $16, INP
2424	add $16, OUTP
2425	cmp $16, LEN
2426	jge .Lcbc_enc_loop
2427	movups STATE, (IVP)
2428.Lcbc_enc_ret:
2429#ifndef __x86_64__
2430	popl KLEN
2431	popl KEYP
2432	popl LEN
2433	popl IVP
2434#endif
2435	ret
2436ENDPROC(aesni_cbc_enc)
2437
2438/*
2439 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2440 *		      size_t len, u8 *iv)
2441 */
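/*
 * Unlike encryption, CBC decryption parallelises: each output block needs
 * only the previous *ciphertext* block, i.e. roughly
 *
 *	P[i] = aes_dec_block(ctx, C[i]) ^ (i ? C[i - 1] : IV)
 *
 * (sketch only), which is why the loop below decrypts four blocks at a
 * time and keeps the last ciphertext block of each group as the next IV.
 */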
2442ENTRY(aesni_cbc_dec)
2443#ifndef __x86_64__
2444	pushl IVP
2445	pushl LEN
2446	pushl KEYP
2447	pushl KLEN
2448	movl 20(%esp), KEYP
2449	movl 24(%esp), OUTP
2450	movl 28(%esp), INP
2451	movl 32(%esp), LEN
2452	movl 36(%esp), IVP
2453#endif
2454	cmp $16, LEN
2455	jb .Lcbc_dec_just_ret
2456	mov 480(KEYP), KLEN
2457	add $240, KEYP
2458	movups (IVP), IV
2459	cmp $64, LEN
2460	jb .Lcbc_dec_loop1
2461.align 4
2462.Lcbc_dec_loop4:
2463	movups (INP), IN1
2464	movaps IN1, STATE1
2465	movups 0x10(INP), IN2
2466	movaps IN2, STATE2
2467#ifdef __x86_64__
2468	movups 0x20(INP), IN3
2469	movaps IN3, STATE3
2470	movups 0x30(INP), IN4
2471	movaps IN4, STATE4
2472#else
2473	movups 0x20(INP), IN1
2474	movaps IN1, STATE3
2475	movups 0x30(INP), IN2
2476	movaps IN2, STATE4
2477#endif
2478	call _aesni_dec4
2479	pxor IV, STATE1
2480#ifdef __x86_64__
2481	pxor IN1, STATE2
2482	pxor IN2, STATE3
2483	pxor IN3, STATE4
2484	movaps IN4, IV
2485#else
2486	pxor IN1, STATE4
2487	movaps IN2, IV
2488	movups (INP), IN1
2489	pxor IN1, STATE2
2490	movups 0x10(INP), IN2
2491	pxor IN2, STATE3
2492#endif
2493	movups STATE1, (OUTP)
2494	movups STATE2, 0x10(OUTP)
2495	movups STATE3, 0x20(OUTP)
2496	movups STATE4, 0x30(OUTP)
2497	sub $64, LEN
2498	add $64, INP
2499	add $64, OUTP
2500	cmp $64, LEN
2501	jge .Lcbc_dec_loop4
2502	cmp $16, LEN
2503	jb .Lcbc_dec_ret
2504.align 4
2505.Lcbc_dec_loop1:
2506	movups (INP), IN
2507	movaps IN, STATE
2508	call _aesni_dec1
2509	pxor IV, STATE
2510	movups STATE, (OUTP)
2511	movaps IN, IV
2512	sub $16, LEN
2513	add $16, INP
2514	add $16, OUTP
2515	cmp $16, LEN
2516	jge .Lcbc_dec_loop1
2517.Lcbc_dec_ret:
2518	movups IV, (IVP)
2519.Lcbc_dec_just_ret:
2520#ifndef __x86_64__
2521	popl KLEN
2522	popl KEYP
2523	popl LEN
2524	popl IVP
2525#endif
2526	ret
2527ENDPROC(aesni_cbc_dec)
2528
2529#ifdef __x86_64__
2530.align 16
2531.Lbswap_mask:
2532	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2533
2534/*
2535 * _aesni_inc_init:	internal ABI
2536 *	setup registers used by _aesni_inc
2537 * input:
2538 *	IV
2539 * output:
2540 *	CTR:	== IV, in little endian
2541 *	TCTR_LOW: == lower qword of CTR
2542 *	INC:	== 1, in little endian
2543 *	BSWAP_MASK == endian swapping mask
2544 */
2545.align 4
2546_aesni_inc_init:
2547	movaps .Lbswap_mask, BSWAP_MASK
2548	movaps IV, CTR
2549	PSHUFB_XMM BSWAP_MASK CTR
2550	mov $1, TCTR_LOW
2551	MOVQ_R64_XMM TCTR_LOW INC
2552	MOVQ_R64_XMM CTR TCTR_LOW
2553	ret
2554ENDPROC(_aesni_inc_init)
2555
2556/*
2557 * _aesni_inc:		internal ABI
2558 *	Increase IV by 1, IV is in big endian
2559 * input:
2560 *	IV
2561 *	CTR:	== IV, in little endian
2562 *	TCTR_LOW: == lower qword of CTR
2563 *	INC:	== 1, in little endian
2564 *	BSWAP_MASK == endian swapping mask
2565 * output:
2566 *	IV:	Increased by 1
2567 * changed:
2568 *	CTR:	== output IV, in little endian
2569 *	TCTR_LOW: == lower qword of CTR
2570 */
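/*
 * Equivalent scalar sketch (illustrative only): the counter is kept in
 * little endian so the low 64 bits can be bumped with a plain add, and
 * only a carry takes the slower 128-bit path:
 *
 *	ctr_lo++;
 *	if (ctr_lo == 0)
 *		ctr_hi++;
 *	iv = bswap128(ctr);		// back to big endian for AES input
 *
 * bswap128() stands in for the PSHUFB with BSWAP_MASK.
 */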
2571.align 4
2572_aesni_inc:
2573	paddq INC, CTR
2574	add $1, TCTR_LOW
2575	jnc .Linc_low
2576	pslldq $8, INC
2577	paddq INC, CTR
2578	psrldq $8, INC
2579.Linc_low:
2580	movaps CTR, IV
2581	PSHUFB_XMM BSWAP_MASK IV
2582	ret
2583ENDPROC(_aesni_inc)
2584
2585/*
2586 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2587 *		      size_t len, u8 *iv)
2588 */
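/*
 * CTR turns the block cipher into a stream cipher; per full block this is,
 * in C terms, roughly (illustrative sketch; the helpers are stand-ins, not
 * symbols from this file):
 *
 *	for (i = 0; i < len / 16; i++) {
 *		aes_enc_block(ctx, ks, ctr);		// keystream = E(K, counter)
 *		xor_blocks(dst + 16 * i, src + 16 * i, ks);
 *		ctr_increment(ctr);			// big-endian +1, see above
 *	}
 */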
2589ENTRY(aesni_ctr_enc)
2590	cmp $16, LEN
2591	jb .Lctr_enc_just_ret
2592	mov 480(KEYP), KLEN
2593	movups (IVP), IV
2594	call _aesni_inc_init
2595	cmp $64, LEN
2596	jb .Lctr_enc_loop1
2597.align 4
2598.Lctr_enc_loop4:
2599	movaps IV, STATE1
2600	call _aesni_inc
2601	movups (INP), IN1
2602	movaps IV, STATE2
2603	call _aesni_inc
2604	movups 0x10(INP), IN2
2605	movaps IV, STATE3
2606	call _aesni_inc
2607	movups 0x20(INP), IN3
2608	movaps IV, STATE4
2609	call _aesni_inc
2610	movups 0x30(INP), IN4
2611	call _aesni_enc4
2612	pxor IN1, STATE1
2613	movups STATE1, (OUTP)
2614	pxor IN2, STATE2
2615	movups STATE2, 0x10(OUTP)
2616	pxor IN3, STATE3
2617	movups STATE3, 0x20(OUTP)
2618	pxor IN4, STATE4
2619	movups STATE4, 0x30(OUTP)
2620	sub $64, LEN
2621	add $64, INP
2622	add $64, OUTP
2623	cmp $64, LEN
2624	jge .Lctr_enc_loop4
2625	cmp $16, LEN
2626	jb .Lctr_enc_ret
2627.align 4
2628.Lctr_enc_loop1:
2629	movaps IV, STATE
2630	call _aesni_inc
2631	movups (INP), IN
2632	call _aesni_enc1
2633	pxor IN, STATE
2634	movups STATE, (OUTP)
2635	sub $16, LEN
2636	add $16, INP
2637	add $16, OUTP
2638	cmp $16, LEN
2639	jge .Lctr_enc_loop1
2640.Lctr_enc_ret:
2641	movups IV, (IVP)
2642.Lctr_enc_just_ret:
2643	ret
2644ENDPROC(aesni_ctr_enc)
2645
2646/*
2647 * _aesni_gf128mul_x_ble:		internal ABI
2648 *	Multiply in GF(2^128) for XTS IVs
2649 * input:
2650 *	IV:	current IV
2651 *	GF128MUL_MASK == mask with 0x87 and 0x01
2652 * output:
2653 *	IV:	next IV
2654 * changed:
2655 *	CTR:	== temporary value
2656 */
2657#define _aesni_gf128mul_x_ble() \
2658	pshufd $0x13, IV, CTR; \
2659	paddq IV, IV; \
2660	psrad $31, CTR; \
2661	pand GF128MUL_MASK, CTR; \
2662	pxor CTR, IV;
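
/*
 * Scalar sketch of the multiplication above (illustrative only): the tweak
 * is treated as two little-endian 64-bit halves and doubled in GF(2^128)
 * with the reduction byte 0x87:
 *
 *	carry = hi >> 63;
 *	hi    = (hi << 1) | (lo >> 63);
 *	lo    = (lo << 1) ^ (carry ? 0x87 : 0);
 *
 * The pshufd/psrad pair broadcasts the relevant sign bits so a single pxor
 * applies both the cross-qword carry and the 0x87 reduction at once.
 */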
2663
2664/*
2665 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2666 *			 bool enc, u8 *iv)
2667 */
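/*
 * Per block, XTS is, in C terms, roughly (illustrative sketch; T is the
 * running tweak and mul_x() the GF(2^128) doubling above):
 *
 *	C[i] = aes_crypt_block(ctx, P[i] ^ T[i]) ^ T[i];
 *	T[i + 1] = mul_x(T[i]);
 *
 * The routine below interleaves eight blocks: tweaks are XORed in and
 * parked in the output buffer while the two _aesni_*4 calls run, then
 * XORed again on the way out.
 */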
2668ENTRY(aesni_xts_crypt8)
2669	cmpb $0, %cl
2670	movl $0, %ecx
2671	movl $240, %r10d
2672	leaq _aesni_enc4, %r11
2673	leaq _aesni_dec4, %rax
2674	cmovel %r10d, %ecx
2675	cmoveq %rax, %r11
2676
2677	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2678	movups (IVP), IV
2679
2680	mov 480(KEYP), KLEN
2681	addq %rcx, KEYP
2682
2683	movdqa IV, STATE1
2684	movdqu 0x00(INP), INC
2685	pxor INC, STATE1
2686	movdqu IV, 0x00(OUTP)
2687
2688	_aesni_gf128mul_x_ble()
2689	movdqa IV, STATE2
2690	movdqu 0x10(INP), INC
2691	pxor INC, STATE2
2692	movdqu IV, 0x10(OUTP)
2693
2694	_aesni_gf128mul_x_ble()
2695	movdqa IV, STATE3
2696	movdqu 0x20(INP), INC
2697	pxor INC, STATE3
2698	movdqu IV, 0x20(OUTP)
2699
2700	_aesni_gf128mul_x_ble()
2701	movdqa IV, STATE4
2702	movdqu 0x30(INP), INC
2703	pxor INC, STATE4
2704	movdqu IV, 0x30(OUTP)
2705
2706	call *%r11
2707
2708	movdqu 0x00(OUTP), INC
2709	pxor INC, STATE1
2710	movdqu STATE1, 0x00(OUTP)
2711
2712	_aesni_gf128mul_x_ble()
2713	movdqa IV, STATE1
2714	movdqu 0x40(INP), INC
2715	pxor INC, STATE1
2716	movdqu IV, 0x40(OUTP)
2717
2718	movdqu 0x10(OUTP), INC
2719	pxor INC, STATE2
2720	movdqu STATE2, 0x10(OUTP)
2721
2722	_aesni_gf128mul_x_ble()
2723	movdqa IV, STATE2
2724	movdqu 0x50(INP), INC
2725	pxor INC, STATE2
2726	movdqu IV, 0x50(OUTP)
2727
2728	movdqu 0x20(OUTP), INC
2729	pxor INC, STATE3
2730	movdqu STATE3, 0x20(OUTP)
2731
2732	_aesni_gf128mul_x_ble()
2733	movdqa IV, STATE3
2734	movdqu 0x60(INP), INC
2735	pxor INC, STATE3
2736	movdqu IV, 0x60(OUTP)
2737
2738	movdqu 0x30(OUTP), INC
2739	pxor INC, STATE4
2740	movdqu STATE4, 0x30(OUTP)
2741
2742	_aesni_gf128mul_x_ble()
2743	movdqa IV, STATE4
2744	movdqu 0x70(INP), INC
2745	pxor INC, STATE4
2746	movdqu IV, 0x70(OUTP)
2747
2748	_aesni_gf128mul_x_ble()
2749	movups IV, (IVP)
2750
2751	call *%r11
2752
2753	movdqu 0x40(OUTP), INC
2754	pxor INC, STATE1
2755	movdqu STATE1, 0x40(OUTP)
2756
2757	movdqu 0x50(OUTP), INC
2758	pxor INC, STATE2
2759	movdqu STATE2, 0x50(OUTP)
2760
2761	movdqu 0x60(OUTP), INC
2762	pxor INC, STATE3
2763	movdqu STATE3, 0x60(OUTP)
2764
2765	movdqu 0x70(OUTP), INC
2766	pxor INC, STATE4
2767	movdqu STATE4, 0x70(OUTP)
2768
2769	ret
2770ENDPROC(aesni_xts_crypt8)
2771
2772#endif
2773