xref: /openbmc/linux/arch/x86/crypto/aesni-intel_asm.S (revision 4da722ca19f30f7db250db808d1ab1703607a932)
1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 *    Author: Huang Ying <ying.huang@intel.com>
9 *            Vinodh Gopal <vinodh.gopal@intel.com>
10 *            Kahraman Akdemir
11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
16 *             Adrian Hoban <adrian.hoban@intel.com>
17 *             James Guilford (james.guilford@intel.com)
18 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
19 *             Tadeusz Struk (tadeusz.struk@intel.com)
20 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
21 *    Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 *    Author: Mathias Krause <minipli@googlemail.com>
25 *
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
30 */
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34#include <asm/frame.h>
35
/*
 * MOVADQ / MOVUDQ move an aligned / unaligned 16 byte value to or from an
 * XMM register.  This can be done with either the FP forms (movaps/movups,
 * move packed single) or the integer forms (movdqa/movdqu, move double quad).
 * Which one is used has made no performance difference since Nehalem (the
 * original Core i7) was released.  The FP forms are one byte shorter, so
 * those are the ones used for now.
 */
#define MOVADQ	movaps
#define MOVUDQ	movups
46
47#ifdef __x86_64__
48
/*
 * 16-byte constants.  Each one sits in its own mergeable .rodata.cst16.*
 * section, so the linker can reorder and merge them.
 */
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.balign 16
.Lgf128mul_x_ble_mask:
	.octa	0x00000000000000010000000000000087

.section	.rodata.cst16.POLY, "aM", @progbits, 16
.balign 16
POLY:		.octa	0xC2000000000000000000000000000001

.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.balign 16
TWOONE:		.octa	0x00000001000000000000000000000001

/* byte-reversal shuffle control used with PSHUFB */
.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.balign 16
SHUF_MASK:	.octa	0x000102030405060708090A0B0C0D0E0F

/* low 64 bits set */
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.balign 16
MASK1:		.octa	0x0000000000000000ffffffffffffffff

/* high 64 bits set */
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.balign 16
MASK2:		.octa	0xffffffffffffffff0000000000000000

.section	.rodata.cst16.ONE, "aM", @progbits, 16
.balign 16
ONE:		.octa	0x00000000000000000000000000000001

/*
 * NOTE(review): this literal has only 31 hex digits, so the most
 * significant nibble of the 128-bit value is zero - confirm that is intended.
 */
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.balign 16
F_MIN_MASK:	.octa	0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0

.section	.rodata.cst16.dec, "aM", @progbits, 16
.balign 16
dec:		.octa	0x1

.section	.rodata.cst16.enc, "aM", @progbits, 16
.balign 16
enc:		.octa	0x2

/*
 * The order of the next three constants must not change: ALL_F has to
 * follow SHIFT_MASK directly, and the all-zero value has to follow ALL_F.
 * They are therefore kept out of the mergeable sections.
 */
.section	.rodata, "a", @progbits
.balign 16
SHIFT_MASK:	.octa	0x0f0e0d0c0b0a09080706050403020100
ALL_F:		.octa	0xffffffffffffffffffffffffffffffff
		.octa	0x00000000000000000000000000000000

/*
 * PSHUFB control masks, 17 entries of 16 bytes (272 bytes in total).
 * The INITIAL_BLOCKS_* macros index this table with (leftover AAD bytes) * 16.
 * A control byte of 0xff makes PSHUFB write zero to that position.
 */
.section .rodata
.balign 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
	.octa	0xffffffffffffffffffffffffffffffff
	.octa	0xffffffffffffffffffffffffffffff0C
	.octa	0xffffffffffffffffffffffffffff0D0C
	.octa	0xffffffffffffffffffffffffff0E0D0C
	.octa	0xffffffffffffffffffffffff0F0E0D0C
	.octa	0xffffffffffffffffffffff0C0B0A0908
	.octa	0xffffffffffffffffffff0D0C0B0A0908
	.octa	0xffffffffffffffffff0E0D0C0B0A0908
	.octa	0xffffffffffffffff0F0E0D0C0B0A0908
	.octa	0xffffffffffffff0C0B0A090807060504
	.octa	0xffffffffffff0D0C0B0A090807060504
	.octa	0xffffffffff0E0D0C0B0A090807060504
	.octa	0xffffffff0F0E0D0C0B0A090807060504
	.octa	0xffffff0C0B0A09080706050403020100
	.octa	0xffff0D0C0B0A09080706050403020100
	.octa	0xff0E0D0C0B0A09080706050403020100
	.octa	0x0F0E0D0C0B0A09080706050403020100
114
115
116.text
117
118
/* NOTE(review): presumably the size of three 8-byte register saves - confirm */
#define	STACK_OFFSET	8*3

/*
 * Offsets of the 16-byte hash-key slots.  The GCM macros in this file
 * address them relative to %rsp.
 */
#define	HashKey		16*0	// HashKey <<1 mod poly
#define	HashKey_2	16*1	// HashKey^2 <<1 mod poly
#define	HashKey_3	16*2	// HashKey^3 <<1 mod poly
#define	HashKey_4	16*3	// HashKey^4 <<1 mod poly
/*
 * The _k slots hold the XOR of the high 64 bits and the low 64 bits of the
 * matching hash-key power (for Karatsuba purposes).
 */
#define	HashKey_k	16*4	// from HashKey   <<1 mod poly
#define	HashKey_2_k	16*5	// from HashKey^2 <<1 mod poly
#define	HashKey_3_k	16*6	// from HashKey^3 <<1 mod poly
#define	HashKey_4_k	16*7	// from HashKey^4 <<1 mod poly
#define	VARIABLE_OFFSET	16*8	// combined size of the eight slots above

/* arg1..arg6 are bare register names and are written as %argN at use sites */
#define arg1	rdi
#define arg2	rsi
#define arg3	rdx
#define arg4	rcx
#define arg5	r8
#define arg6	r9
/* arg7..arg10 are memory operands addressed through %r14 */
#define arg7	STACK_OFFSET+8(%r14)
#define arg8	STACK_OFFSET+16(%r14)
#define arg9	STACK_OFFSET+24(%r14)
#define arg10	STACK_OFFSET+32(%r14)
/* memory operand at byte offset 2*15*16 from the pointer in arg1 */
#define keysize	2*15*16(%arg1)
149#endif
150
151
/* XMM register roles; STATE and IN are aliases of STATE1 and IN1 */
#define STATE1		%xmm0
#define STATE2		%xmm4
#define STATE3		%xmm5
#define STATE4		%xmm6
#define STATE		STATE1
#define IN1		%xmm1
#define IN2		%xmm7
#define IN3		%xmm8
#define IN4		%xmm9
#define IN		IN1
#define KEY		%xmm2
#define IV		%xmm3

#define BSWAP_MASK	%xmm10
#define CTR		%xmm11
#define INC		%xmm12

/* shares %xmm10 with BSWAP_MASK */
#define GF128MUL_MASK	%xmm10

/* General purpose register roles, one set for 64-bit and one for 32-bit */
#ifdef __x86_64__
#define AREG		%rax
#define KEYP		%rdi
#define OUTP		%rsi
#define UKEYP		OUTP
#define INP		%rdx
#define LEN		%rcx
#define IVP		%r8
#define KLEN		%r9d
#define T1		%r10
#define TKEYP		T1
#define T2		%r11
#define TCTR_LOW	T2
#else
#define AREG		%eax
#define KEYP		%edi
#define OUTP		AREG
#define UKEYP		OUTP
#define INP		%edx
#define LEN		%esi
#define IVP		%ebp
#define KLEN		%ebx
#define T1		%ecx
#define TKEYP		T1
#endif
196
197
198#ifdef __x86_64__
/*
 * GHASH_MUL - multiply two 128-bit values with PCLMULQDQ and reduce the
 * result: Data*HashKey mod (128,127,126,121,0).
 *
 * Input:  GH and HK (128 bits each, bit-reflected)
 * Output: GH = GH * HK * x mod poly
 *
 * To compute GH = GH*HashKey mod poly, pass HK = HashKey<<1 mod poly;
 * GH * HK * x mod poly is then equivalent to GH*HashKey mod poly.
 *
 * HK is only read.  TMP1..TMP5 are scratch XMM registers and are overwritten.
 */
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	/* Karatsuba: three 64x64 carry-less products (a = GH, b = HK) */
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pshufd	  $78, \HK, \TMP3
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH   = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = middle term a1*b0+a0*b1

	/* fold the middle term into the two 128-bit halves */
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# byte shift left by 2 DWs
	psrldq	  $8, \TMP2		# byte shift right by 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH now holds the product GH*HK

	/* first phase of the reduction */
	movdqa	  \GH, \TMP2		# three copies of GH so that the
	movdqa	  \GH, \TMP3		# shifts below are independent
	movdqa	  \GH, \TMP4
	pslld	  $31, \TMP2		# per-dword left shift by 31
	pslld	  $30, \TMP3		# per-dword left shift by 30
	pslld	  $25, \TMP4		# per-dword left shift by 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# byte shift right by 1 DW, kept for phase two
	pslldq	  $12, \TMP2		# byte shift left by 3 DWs
	pxor	  \TMP2, \GH

	/* second phase of the reduction */
	movdqa	  \GH, \TMP2		# again three copies of GH
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4
	psrld	  $1, \TMP2		# per-dword right shift by 1
	psrld	  $2, \TMP3		# per-dword right shift by 2
	psrld	  $7, \TMP4		# per-dword right shift by 7
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# the reduced result is in GH
.endm
258
/*
 * INITIAL_BLOCKS_DEC - GHASH the AAD, then decrypt the leading blocks.
 *
 * if a = number of total ciphertext bytes
 * b = floor(a/16)
 * num_initial_blocks = b mod 4
 * Decrypt the initial num_initial_blocks blocks and apply GHASH to the
 * ciphertext that was read.  When %r13 >= 64 the powers HashKey^2..HashKey^4
 * (and their Karatsuba halves) are also stored in the HashKey_* slots at
 * %rsp while four further blocks are processed in XMM1..XMM4.
 *
 * Macro arguments:
 *   num_initial_blocks, operation - only used to make the labels unique
 *   TMP1, TMP2, TMP4..TMP7 - scratch XMM registers
 *   TMP3       - hash key operand handed to GHASH_MUL; only read here
 *   XMM0       - counter block, loaded through the pointer in arg5
 *   XMM1..XMM4 - the four blocks handled when %r13 >= 64 (XMM1 and XMM2
 *                also serve as scratch while the AAD is hashed)
 *   XMMDst     - XORed into XMM1 after those four blocks are done
 *   i          - number of the XMM register that receives the AAD hash
 *   i_seq      - digits naming the XMM registers of the initial blocks
 *
 * Other inputs: arg1 (round keys and key size), arg2 (output), arg3 (input),
 * arg7 (AAD pointer), arg8 (AAD length), %r13 (compared against 64;
 * presumably the remaining byte count set by the caller - confirm).
 *
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 and %xmm14
 * are clobbered.
 * %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified.
 * On exit %r11 is the byte offset just past the blocks processed here.
 */
.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10           # %r10 = AAD
	mov	   arg8, %r12           # %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
	pxor	   \XMM2, \XMM2

	cmp	   $16, %r11
	jl	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r12
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\num_initial_blocks\operation

	movdqu	   \XMM2, %xmm\i
	cmp	   $0, %r11
	je	   _get_AAD_done\num_initial_blocks\operation

	pxor	   %xmm\i,%xmm\i

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some CT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff.
	NOTE(review): these loads can run past the end of the AAD buffer,
	so this relies on every caller providing readable bytes after the
	AAD - confirm.
	The branches below must stay signed: %r11 may drop below zero
	after the 8-byte read. */
_get_AAD_rest8\num_initial_blocks\operation:
	cmp	   $4, %r11
	jle	   _get_AAD_rest4\num_initial_blocks\operation
	movq	   (%r10), \TMP1
	add	   $8, %r10
	sub	   $8, %r11
	pslldq	   $8, \TMP1
	psrldq	   $8, %xmm\i
	pxor	   \TMP1, %xmm\i
	jmp	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
	cmp	   $0, %r11
	jle	   _get_AAD_rest0\num_initial_blocks\operation
	mov	   (%r10), %eax
	movq	   %rax, \TMP1
	add	   $4, %r10
	sub	   $4, %r11             # four more bytes consumed
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
_get_AAD_rest0\num_initial_blocks\operation:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	pshufb and an array of shuffle masks.  %r12 still holds the
	number of leftover AAD bytes (0..15); %rax is free here because
	it is reloaded from arg5 below. */
	movq	   %r12, %r11
	salq	   $4, %r11
	lea	   aad_shift_arr(%rip), %rax
	movdqu	   (%rax, %r11, 1), \TMP1
	PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\num_initial_blocks\operation:
	xor	   %r11, %r11 # initialise the data pointer offset as zero
	/* start AES for num_initial_blocks blocks */

	mov	   %arg5, %rax                      # %rax = pointer to Y0
	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
	PSHUFB_XMM   %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ		ONE(%rip),\TMP1
	MOVADQ		(%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0                 # INCR Y0
	movdqa	   \XMM0, %xmm\index
	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
	pxor	   \TMP2, %xmm\index            # round 0: xor with first round key
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax			      # 128->9, 192->11, 256->13

aes_loop_initial_dec\num_initial_blocks:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_dec\num_initial_blocks

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index         # Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	/* write back plaintext/ciphertext for num_initial_blocks */
	add	   $16, %r11

	movdqa     \TMP1, %xmm\index            # keep the input block for GHASH
	PSHUFB_XMM	   %xmm14, %xmm\index
	/* prepare plaintext/ciphertext for GHASH computation */
.endr
.endif

	/* apply GHASH on num_initial_blocks blocks */

.if \i == 5
	pxor       %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	/* below 64: no need for precomputed values */
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
	MOVADQ	   ONE(%rip), \TMP1
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	/* TMP5 = HashKey^2<<1 (mod poly) */
	movdqa	   \TMP5, HashKey_2(%rsp)
	/* HashKey_2 = HashKey^2<<1 (mod poly) */
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	/* TMP5 = HashKey^3<<1 (mod poly) */
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	/* TMP5 = HashKey^4<<1 (mod poly) */
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	sub	   $4,%eax			# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_dec_done\num_initial_blocks

	/* extra rounds for 192/256-bit keys, on the same four block
	registers as every other round */
aes_loop_pre_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
	AESENC	   \TMP2, \XMM1
	AESENC	   \TMP2, \XMM2
	AESENC	   \TMP2, \XMM3
	AESENC	   \TMP2, \XMM4
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	/* XOR with the input, store the result, then keep the input block
	in the register so that it is what gets hashed */
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa     \TMP1, \XMM4
	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
	/* combine GHASHed value with the corresponding ciphertext */
	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
516
517
/*
 * INITIAL_BLOCKS_ENC - GHASH the AAD, then encrypt the leading blocks.
 *
 * if a = number of total plaintext bytes
 * b = floor(a/16)
 * num_initial_blocks = b mod 4
 * Encrypt the initial num_initial_blocks blocks and apply GHASH to the
 * ciphertext that was produced.  When %r13 >= 64 the powers
 * HashKey^2..HashKey^4 (and their Karatsuba halves) are also stored in the
 * HashKey_* slots at %rsp while four further blocks are processed in
 * XMM1..XMM4.
 *
 * Macro arguments:
 *   num_initial_blocks, operation - only used to make the labels unique
 *   TMP1, TMP2, TMP4..TMP7 - scratch XMM registers
 *   TMP3       - hash key operand handed to GHASH_MUL; only read here
 *   XMM0       - counter block, loaded through the pointer in arg5
 *   XMM1..XMM4 - the four blocks handled when %r13 >= 64 (XMM1 and XMM2
 *                also serve as scratch while the AAD is hashed)
 *   XMMDst     - XORed into XMM1 after those four blocks are done
 *   i          - number of the XMM register that receives the AAD hash
 *   i_seq      - digits naming the XMM registers of the initial blocks
 *
 * Other inputs: arg1 (round keys and key size), arg2 (output), arg3 (input),
 * arg7 (AAD pointer), arg8 (AAD length), %r13 (compared against 64;
 * presumably the remaining byte count set by the caller - confirm).
 *
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 and %xmm14
 * are clobbered.
 * %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified.
 * On exit %r11 is the byte offset just past the blocks processed here.
 */
.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10           # %r10 = AAD
	mov	   arg8, %r12           # %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
	pxor	   \XMM2, \XMM2

	cmp	   $16, %r11
	jl	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
	movdqu	   (%r10), %xmm\i
	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
	pxor	   %xmm\i, \XMM2
	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	add	   $16, %r10
	sub	   $16, %r12
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\num_initial_blocks\operation

	movdqu	   \XMM2, %xmm\i
	cmp	   $0, %r11
	je	   _get_AAD_done\num_initial_blocks\operation

	pxor	   %xmm\i,%xmm\i

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some PT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff.
	NOTE(review): these loads can run past the end of the AAD buffer,
	so this relies on every caller providing readable bytes after the
	AAD - confirm.
	The branches below must stay signed: %r11 may drop below zero
	after the 8-byte read. */
_get_AAD_rest8\num_initial_blocks\operation:
	cmp	   $4, %r11
	jle	   _get_AAD_rest4\num_initial_blocks\operation
	movq	   (%r10), \TMP1
	add	   $8, %r10
	sub	   $8, %r11
	pslldq	   $8, \TMP1
	psrldq	   $8, %xmm\i
	pxor	   \TMP1, %xmm\i
	jmp	   _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
	cmp	   $0, %r11
	jle	   _get_AAD_rest0\num_initial_blocks\operation
	mov	   (%r10), %eax
	movq	   %rax, \TMP1
	add	   $4, %r10
	sub	   $4, %r11             # four more bytes consumed
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
_get_AAD_rest0\num_initial_blocks\operation:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	pshufb and an array of shuffle masks.  %r12 still holds the
	number of leftover AAD bytes (0..15); %rax is free here because
	it is reloaded from arg5 below. */
	movq	   %r12, %r11
	salq	   $4, %r11
	lea	   aad_shift_arr(%rip), %rax
	movdqu	   (%rax, %r11, 1), \TMP1
	PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
	pxor	   \XMM2, %xmm\i
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

_get_AAD_done\num_initial_blocks\operation:
	xor	   %r11, %r11 # initialise the data pointer offset as zero
	/* start AES for num_initial_blocks blocks */

	mov	   %arg5, %rax                      # %rax = pointer to Y0
	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
	PSHUFB_XMM   %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ		ONE(%rip),\TMP1
	MOVADQ		0(%arg1),\TMP2
.irpc index, \i_seq
	paddd		\TMP1, \XMM0                 # INCR Y0
	MOVADQ		\XMM0, %xmm\index
	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
	pxor		\TMP2, %xmm\index            # round 0: xor with first round key
.endr
	lea	0x10(%arg1),%r10
	mov	keysize,%eax
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax			      # 128->9, 192->11, 256->13

aes_loop_initial_enc\num_initial_blocks:
	MOVADQ	(%r10),\TMP1
.irpc	index, \i_seq
	AESENC	\TMP1, %xmm\index
.endr
	add	$16,%r10
	sub	$1,%eax
	jnz	aes_loop_initial_enc\num_initial_blocks

	MOVADQ	(%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index         # Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	/* write back plaintext/ciphertext for num_initial_blocks */
	add	   $16, %r11
	PSHUFB_XMM	   %xmm14, %xmm\index

	/* prepare plaintext/ciphertext for GHASH computation; the
	register still holds the block that was just written out */
.endr
.endif

	/* apply GHASH on num_initial_blocks blocks */

.if \i == 5
	pxor       %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor       %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor       %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	/* below 64: no need for precomputed values */
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
	MOVADQ	   ONE(%rip),\TMP1
	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

	paddd	   \TMP1, \XMM0              # INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	/* TMP5 = HashKey^2<<1 (mod poly) */
	movdqa	   \TMP5, HashKey_2(%rsp)
	/* HashKey_2 = HashKey^2<<1 (mod poly) */
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	/* TMP5 = HashKey^3<<1 (mod poly) */
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps 0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	/* TMP5 = HashKey^4<<1 (mod poly) */
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	sub	   $4,%eax			# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_enc_done\num_initial_blocks

	/* extra rounds for 192/256-bit keys, on the same four block
	registers as every other round */
aes_loop_pre_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
	AESENC	   \TMP2, \XMM1
	AESENC	   \TMP2, \XMM2
	AESENC	   \TMP2, \XMM3
	AESENC	   \TMP2, \XMM4
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_enc\num_initial_blocks

aes_loop_pre_enc_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	/* XOR with the input; the registers then hold the output blocks,
	which are both stored and hashed */
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)

	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
	/* combine GHASHed value with the corresponding ciphertext */
	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
772
/*
 * GHASH_4_ENCRYPT_4_PARALLEL_ENC
 *
 * encrypt 4 blocks at a time
 * ghash the 4 previously encrypted ciphertext blocks
 *
 * The AES rounds for the four new counter blocks are interleaved with the
 * Karatsuba multiplications of the four previous blocks by
 * HashKey^4..HashKey (read from the HashKey_* slots at %rsp).
 *
 * Macro arguments:
 *   TMP1..TMP6 - scratch XMM registers
 *   XMM0       - counter; incremented four times
 *   XMM1..XMM4 - in: the four previous blocks to be hashed
 *                out: the four new ciphertext blocks, byte swapped, with
 *                the reduced GHASH value XORed into XMM1
 *   XMM5..XMM8 - scratch; receive copies of the incoming XMM1..XMM4
 *   operation  - not referenced by the macro body
 *
 * %arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 * %r10, %rax and %xmm15 are clobbered.
 * Labels carry the macro expansion counter so that the macro can be
 * expanded more than once in a file.
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	/* multiply XMM5 * HashKey^4 using karatsuba */

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
	movdqa    \XMM0, \XMM1
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM2
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM3
	paddd     ONE(%rip), \XMM0		# INCR CNT
	movdqa    \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	pxor	  (%arg1), \XMM1               # round 0: xor with first round key
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
	movaps 0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1              # Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	/* multiply XMM6 * HashKey^3 using karatsuba */
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
	movaps 0x30(%arg1), \TMP3
	AESENC    \TMP3, \XMM1              # Round 3
	AESENC    \TMP3, \XMM2
	AESENC    \TMP3, \XMM3
	AESENC    \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
	movaps 0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
	/* accumulate the results in TMP4:XMM5, TMP6 holds the middle part */
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5

	/* multiply XMM7 * HashKey^2 using karatsuba */

	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
	movaps 0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1              # Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
	movaps 0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
	movaps 0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1             # Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
	/* accumulate the results in TMP4:XMM5, TMP6 holds the middle part */
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	/* Multiply XMM8 * HashKey */
	/* XMM8 and TMP5 hold the values for the two operands */

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
	movaps 0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1            # Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done\@

	/* extra rounds for 192/256-bit keys, on the same four block
	registers as every other round */
aes_loop_par_enc\@:
	MOVADQ	  (%r10),\TMP3
	AESENC	  \TMP3, \XMM1
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc\@

aes_loop_par_enc_done\@:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1           # final round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa    HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
	movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
	movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
	movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
	movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap

	/* combine the four partial products */
	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3                    # byte shift left by 2 DWs
	psrldq	  $8, \TMP2                    # byte shift right by 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5

	/* first phase of reduction */

	movdqa    \XMM5, \TMP2
	movdqa    \XMM5, \TMP3
	movdqa    \XMM5, \TMP4
	/* XMM5 is copied to TMP2, TMP3, TMP4 so the shifts are independent */
	pslld     $31, \TMP2                   # per-dword left shift by 31
	pslld     $30, \TMP3                   # per-dword left shift by 30
	pslld     $25, \TMP4                   # per-dword left shift by 25
	pxor      \TMP3, \TMP2	               # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5                    # byte shift right by 1 DW
	pslldq    $12, \TMP2                   # byte shift left by 3 DWs
	pxor      \TMP2, \XMM5

	/* second phase of reduction */

	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa    \XMM5,\TMP3
	movdqa    \XMM5,\TMP4
	psrld     $1, \TMP2                    # per-dword right shift by 1
	psrld     $2, \TMP3                    # per-dword right shift by 2
	psrld     $7, \TMP4                    # per-dword right shift by 7
	pxor      \TMP3,\TMP2		       # xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \XMM5
	pxor      \TMP1, \XMM5                 # the reduced result is in XMM5

	pxor	  \XMM5, \XMM1                 # fold the hash into the first new block
.endm
980
981/*
982* decrypt 4 blocks at a time
983* ghash the 4 previously decrypted ciphertext blocks
984* %arg1, %arg2, %arg3 are used as pointers only, not modified
985* %r11 is the data offset value
*
* Register roles, as used by the body below:
*   XMM0       counter block; incremented four times (paddd ONE), once per
*              output block
*   XMM1-XMM4  in:  the four blocks still to be folded into GHASH (when the
*                   macro is re-entered from a loop these are the values left
*                   by the previous pass)
*              out: the four ciphertext blocks read by this pass, byte-swapped,
*                   with the reduced GHASH value xor-ed into XMM1
*   XMM5-XMM8  working copies of XMM1-XMM4 used for the carry-less multiplies
*   TMP1-TMP6  scratch
* Also clobbers %xmm15 (loaded with SHUF_MASK), %r10 and %eax.
*
* Input is read from (%arg3,%r11,1) and the decrypted blocks are written to
* (%arg2,%r11,1). The AES rounds are interleaved with the PCLMULQDQ work:
* rounds 1-9 use the round keys at 0x10..0x90(%arg1); the extra rounds needed
* for 192/256-bit keys run in the aes_loop_par_dec loop, whose trip count is
* derived from keysize.
*
* NOTE(review): the .irpc in that loop names %xmm1-%xmm4 literally instead of
* using the XMM1-XMM4 arguments, so it only matches when the caller passes
* exactly those registers. The aes_loop_par_dec* labels are plain labels (not
* made unique per expansion), so expanding this macro twice in one file would
* define them twice. The "operation" argument is not referenced in the body.
986*/
987.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
988TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
989
990	movdqa	  \XMM1, \XMM5
991	movdqa	  \XMM2, \XMM6
992	movdqa	  \XMM3, \XMM7
993	movdqa	  \XMM4, \XMM8
994
995        movdqa    SHUF_MASK(%rip), %xmm15
996        # multiply XMM5 * HashKey_4 using karatsuba (TMP5 holds HashKey_4)
997
998	movdqa	  \XMM5, \TMP4
999	pshufd	  $78, \XMM5, \TMP6
1000	pxor	  \XMM5, \TMP6
1001	paddd     ONE(%rip), \XMM0		# INCR CNT
1002	movdqa	  HashKey_4(%rsp), \TMP5
1003	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1004	movdqa    \XMM0, \XMM1
1005	paddd     ONE(%rip), \XMM0		# INCR CNT
1006	movdqa    \XMM0, \XMM2
1007	paddd     ONE(%rip), \XMM0		# INCR CNT
1008	movdqa    \XMM0, \XMM3
1009	paddd     ONE(%rip), \XMM0		# INCR CNT
1010	movdqa    \XMM0, \XMM4
1011	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
1012	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1013	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1014	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1015	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1016
1017	pxor	  (%arg1), \XMM1		# Round 0: xor with the first round key
1018	pxor	  (%arg1), \XMM2
1019	pxor	  (%arg1), \XMM3
1020	pxor	  (%arg1), \XMM4
1021	movdqa	  HashKey_4_k(%rsp), \TMP5
1022	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
1023	movaps 0x10(%arg1), \TMP1
1024	AESENC	  \TMP1, \XMM1              # Round 1
1025	AESENC	  \TMP1, \XMM2
1026	AESENC	  \TMP1, \XMM3
1027	AESENC	  \TMP1, \XMM4
1028	movaps 0x20(%arg1), \TMP1
1029	AESENC	  \TMP1, \XMM1              # Round 2
1030	AESENC	  \TMP1, \XMM2
1031	AESENC	  \TMP1, \XMM3
1032	AESENC	  \TMP1, \XMM4
1033	movdqa	  \XMM6, \TMP1			# multiply XMM6 * HashKey_3
1034	pshufd	  $78, \XMM6, \TMP2
1035	pxor	  \XMM6, \TMP2
1036	movdqa	  HashKey_3(%rsp), \TMP5
1037	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
1038	movaps 0x30(%arg1), \TMP3
1039	AESENC    \TMP3, \XMM1              # Round 3
1040	AESENC    \TMP3, \XMM2
1041	AESENC    \TMP3, \XMM3
1042	AESENC    \TMP3, \XMM4
1043	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
1044	movaps 0x40(%arg1), \TMP3
1045	AESENC	  \TMP3, \XMM1              # Round 4
1046	AESENC	  \TMP3, \XMM2
1047	AESENC	  \TMP3, \XMM3
1048	AESENC	  \TMP3, \XMM4
1049	movdqa	  HashKey_3_k(%rsp), \TMP5
1050	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1051	movaps 0x50(%arg1), \TMP3
1052	AESENC	  \TMP3, \XMM1              # Round 5
1053	AESENC	  \TMP3, \XMM2
1054	AESENC	  \TMP3, \XMM3
1055	AESENC	  \TMP3, \XMM4
1056	pxor	  \TMP1, \TMP4
1057# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1058	pxor	  \XMM6, \XMM5
1059	pxor	  \TMP2, \TMP6
1060	movdqa	  \XMM7, \TMP1
1061	pshufd	  $78, \XMM7, \TMP2
1062	pxor	  \XMM7, \TMP2
1063	movdqa	  HashKey_2(%rsp ), \TMP5
1064
1065        # Multiply XMM7 * HashKey_2 using karatsuba (TMP5 holds HashKey_2)
1066
1067	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
1068	movaps 0x60(%arg1), \TMP3
1069	AESENC	  \TMP3, \XMM1              # Round 6
1070	AESENC	  \TMP3, \XMM2
1071	AESENC	  \TMP3, \XMM3
1072	AESENC	  \TMP3, \XMM4
1073	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
1074	movaps 0x70(%arg1), \TMP3
1075	AESENC	  \TMP3, \XMM1             # Round 7
1076	AESENC	  \TMP3, \XMM2
1077	AESENC	  \TMP3, \XMM3
1078	AESENC	  \TMP3, \XMM4
1079	movdqa	  HashKey_2_k(%rsp), \TMP5
1080	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1081	movaps 0x80(%arg1), \TMP3
1082	AESENC	  \TMP3, \XMM1             # Round 8
1083	AESENC	  \TMP3, \XMM2
1084	AESENC	  \TMP3, \XMM3
1085	AESENC	  \TMP3, \XMM4
1086	pxor	  \TMP1, \TMP4
1087# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1088	pxor	  \XMM7, \XMM5
1089	pxor	  \TMP2, \TMP6
1090
1091        # Multiply XMM8 * HashKey
1092        # XMM8 and TMP5 hold the values for the two operands
1093
1094	movdqa	  \XMM8, \TMP1
1095	pshufd	  $78, \XMM8, \TMP2
1096	pxor	  \XMM8, \TMP2
1097	movdqa	  HashKey(%rsp), \TMP5
1098	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1099	movaps 0x90(%arg1), \TMP3
1100	AESENC	  \TMP3, \XMM1            # Round 9
1101	AESENC	  \TMP3, \XMM2
1102	AESENC	  \TMP3, \XMM3
1103	AESENC	  \TMP3, \XMM4
1104	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
1105	lea	  0xa0(%arg1),%r10		# %r10 = address of the round key after round 9
1106	mov	  keysize,%eax
1107	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1108	sub	  $4,%eax			# 128->0, 192->2, 256->4
1109	jz	  aes_loop_par_dec_done		# 128-bit key: no extra rounds
1110
1111aes_loop_par_dec:
1112	MOVADQ	  (%r10),\TMP3
1113.irpc	index, 1234
1114	AESENC	  \TMP3, %xmm\index
1115.endr
1116	add	  $16,%r10
1117	sub	  $1,%eax
1118	jnz	  aes_loop_par_dec
1119
1120aes_loop_par_dec_done:
1121	MOVADQ	  (%r10), \TMP3
1122	AESENCLAST \TMP3, \XMM1           # last round
1123	AESENCLAST \TMP3, \XMM2
1124	AESENCLAST \TMP3, \XMM3
1125	AESENCLAST \TMP3, \XMM4
1126	movdqa    HashKey_k(%rsp), \TMP5
1127	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1128	movdqu	  (%arg3,%r11,1), \TMP3
1129	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1130	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
1131	movdqa    \TMP3, \XMM1			# keep the ciphertext block for the next GHASH pass
1132	movdqu	  16(%arg3,%r11,1), \TMP3
1133	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1134	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
1135	movdqa    \TMP3, \XMM2			# keep the ciphertext block for the next GHASH pass
1136	movdqu	  32(%arg3,%r11,1), \TMP3
1137	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1138	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
1139	movdqa    \TMP3, \XMM3			# keep the ciphertext block for the next GHASH pass
1140	movdqu	  48(%arg3,%r11,1), \TMP3
1141	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1142	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
1143	movdqa    \TMP3, \XMM4			# keep the ciphertext block for the next GHASH pass
1144	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1145	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1146	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1147	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1148
1149	pxor	  \TMP4, \TMP1
1150	pxor	  \XMM8, \XMM5
1151	pxor	  \TMP6, \TMP2
1152	pxor	  \TMP1, \TMP2
1153	pxor	  \XMM5, \TMP2
1154	movdqa	  \TMP2, \TMP3
1155	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1156	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1157	pxor	  \TMP3, \XMM5
1158	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1159
1160        # first phase of reduction
1161
1162	movdqa    \XMM5, \TMP2
1163	movdqa    \XMM5, \TMP3
1164	movdqa    \XMM5, \TMP4
1165# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1166	pslld     $31, \TMP2                   # packed left shift << 31
1167	pslld     $30, \TMP3                   # packed left shift << 30
1168	pslld     $25, \TMP4                   # packed left shift << 25
1169	pxor      \TMP3, \TMP2	               # xor the shifted versions
1170	pxor      \TMP4, \TMP2
1171	movdqa    \TMP2, \TMP5
1172	psrldq    $4, \TMP5                    # right shift TMP5 1 DW
1173	pslldq    $12, \TMP2                   # left shift TMP2 3 DWs
1174	pxor      \TMP2, \XMM5
1175
1176        # second phase of reduction
1177
1178	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1179	movdqa    \XMM5,\TMP3
1180	movdqa    \XMM5,\TMP4
1181	psrld     $1, \TMP2                    # packed right shift >>1
1182	psrld     $2, \TMP3                    # packed right shift >>2
1183	psrld     $7, \TMP4                    # packed right shift >>7
1184	pxor      \TMP3,\TMP2		       # xor the shifted versions
1185	pxor      \TMP4,\TMP2
1186	pxor      \TMP5, \TMP2
1187	pxor      \TMP2, \XMM5
1188	pxor      \TMP1, \XMM5                 # result is in XMM5
1189
1190	pxor	  \XMM5, \XMM1			# fold the reduced hash into the first block of the next pass
1191.endm
1192
1193/* GHASH the last 4 ciphertext blocks. */
/*
* Computes XMM1*HashKey_4 + XMM2*HashKey_3 + XMM3*HashKey_2 + XMM4*HashKey
* with Karatsuba carry-less multiplies and reduces the 256-bit sum.
*
*   XMM1-XMM4  in: the four blocks to hash; clobbered
*   XMMDst     out: the reduced 128-bit result
*   TMP1-TMP7  scratch
*
* While accumulating, TMP6 collects the high products (a1*b1), XMMDst the low
* products (a0*b0) and XMM1 the middle products. The HashKey* and HashKey*_k
* values are read from the stack area at %rsp.
* NOTE(review): those stack slots are filled in outside this macro; the _k
* slots presumably hold the xor of the two key halves - confirm.
*/
1194.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1195TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1196
1197        # Multiply XMM1 * HashKey_4 (using Karatsuba)
1198
1199	movdqa	  \XMM1, \TMP6
1200	pshufd	  $78, \XMM1, \TMP2
1201	pxor	  \XMM1, \TMP2
1202	movdqa	  HashKey_4(%rsp), \TMP5
1203	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1204	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1205	movdqa	  HashKey_4_k(%rsp), \TMP4
1206	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1207	movdqa	  \XMM1, \XMMDst
1208	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1209
1210        # Multiply XMM2 * HashKey_3 (using Karatsuba)
1211
1212	movdqa	  \XMM2, \TMP1
1213	pshufd	  $78, \XMM2, \TMP2
1214	pxor	  \XMM2, \TMP2
1215	movdqa	  HashKey_3(%rsp), \TMP5
1216	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1217	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1218	movdqa	  HashKey_3_k(%rsp), \TMP4
1219	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1220	pxor	  \TMP1, \TMP6
1221	pxor	  \XMM2, \XMMDst
1222	pxor	  \TMP2, \XMM1
1223# results accumulated in TMP6, XMMDst, XMM1
1224
1225        # Multiply XMM3 * HashKey_2 (using Karatsuba)
1226
1227	movdqa	  \XMM3, \TMP1
1228	pshufd	  $78, \XMM3, \TMP2
1229	pxor	  \XMM3, \TMP2
1230	movdqa	  HashKey_2(%rsp), \TMP5
1231	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1232	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1233	movdqa	  HashKey_2_k(%rsp), \TMP4
1234	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1235	pxor	  \TMP1, \TMP6
1236	pxor	  \XMM3, \XMMDst
1237	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1238
1239        # Multiply XMM4 * HashKey (using Karatsuba)
1240	movdqa	  \XMM4, \TMP1
1241	pshufd	  $78, \XMM4, \TMP2
1242	pxor	  \XMM4, \TMP2
1243	movdqa	  HashKey(%rsp), \TMP5
1244	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1245	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1246	movdqa	  HashKey_k(%rsp), \TMP4
1247	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1248	pxor	  \TMP1, \TMP6
1249	pxor	  \XMM4, \XMMDst
1250	pxor	  \XMM1, \TMP2
1251	pxor	  \TMP6, \TMP2
1252	pxor	  \XMMDst, \TMP2
1253	# middle section of the temp results combined as in karatsuba algorithm
1254	movdqa	  \TMP2, \TMP4
1255	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1256	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1257	pxor	  \TMP4, \XMMDst
1258	pxor	  \TMP2, \TMP6
1259# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1260	# first phase of the reduction
1261	movdqa    \XMMDst, \TMP2
1262	movdqa    \XMMDst, \TMP3
1263	movdqa    \XMMDst, \TMP4
1264# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1265	pslld     $31, \TMP2                # packed left shift << 31
1266	pslld     $30, \TMP3                # packed left shift << 30
1267	pslld     $25, \TMP4                # packed left shift << 25
1268	pxor      \TMP3, \TMP2              # xor the shifted versions
1269	pxor      \TMP4, \TMP2
1270	movdqa    \TMP2, \TMP7
1271	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1272	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1273	pxor      \TMP2, \XMMDst
1274
1275        # second phase of the reduction
1276	movdqa    \XMMDst, \TMP2
1277	# make 3 copies of XMMDst for doing 3 shift operations
1278	movdqa    \XMMDst, \TMP3
1279	movdqa    \XMMDst, \TMP4
1280	psrld     $1, \TMP2                 # packed right shift >> 1
1281	psrld     $2, \TMP3                 # packed right shift >> 2
1282	psrld     $7, \TMP4                 # packed right shift >> 7
1283	pxor      \TMP3, \TMP2              # xor the shifted versions
1284	pxor      \TMP4, \TMP2
1285	pxor      \TMP7, \TMP2
1286	pxor      \TMP2, \XMMDst
1287	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1288.endm
1289
1290
1291/* Encryption of a single block
1292* uses eax & r10
*
*   XMM0  in: the block to encrypt, out: the encrypted block
*   TMP1  scratch register that receives each round key
*
* The round keys are read from consecutive 16-byte slots starting at (%arg1).
* The first key is xor-ed in, then 9, 11 or 13 AESENC rounds follow (count
* derived from keysize), then one AESENCLAST with the next key.
* The loop label carries the per-expansion macro counter, so the macro can be
* expanded more than once in a file.
1293*/
1294
1295.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1296
1297	pxor		(%arg1), \XMM0		# round 0: xor with the first round key
1298	mov		keysize,%eax
1299	shr		$2,%eax			# 128->4, 192->6, 256->8
1300	add		$5,%eax			# 128->9, 192->11, 256->13
1301	lea		16(%arg1), %r10	  # get first expanded key address
1302
1303_esb_loop_\@:
1304	MOVADQ		(%r10),\TMP1
1305	AESENC		\TMP1,\XMM0
1306	add		$16,%r10		# step to the next round key
1307	sub		$1,%eax			# one AESENC round fewer to go
1308	jnz		_esb_loop_\@
1309
1310	MOVADQ		(%r10),\TMP1		# key for the final round
1311	AESENCLAST	\TMP1,\XMM0
1312.endm
1313/*****************************************************************************
1314* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1315*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1316*                   const u8 *in,      // Ciphertext input
1317*                   u64 plaintext_len, // Length of data in bytes for decryption.
1318*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1319*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1320*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1321*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1322*                   const u8 *aad,     // Additional Authentication Data (AAD)
1323*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1324*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1325*                                      // given authentication tag and only return the plaintext if they match.
1326*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1327*                                      // (most likely), 12 or 8.
1328*
1329* Assumptions:
1330*
1331* keys:
1332*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1333*       set of 11, 13 or 15 round keys (AES-128/192/256) in void *aes_ctx
1334*
1335* iv:
1336*       0                   1                   2                   3
1337*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1338*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1339*       |                             Salt  (From the SA)               |
1340*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1341*       |                     Initialization Vector                     |
1342*       |         (This is the sequence number from IPSec header)       |
1343*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1344*       |                              0x1                              |
1345*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1346*
1347*
1348*
1349* AAD:
1350*       AAD padded to 128 bits with 0
1351*       for example, assume AAD is a u32 vector
1352*
1353*       if AAD is 8 bytes:
1354*       AAD[3] = {A0, A1};
1355*       padded AAD in xmm register = {A1 A0 0 0}
1356*
1357*       0                   1                   2                   3
1358*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1359*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1360*       |                               SPI (A1)                        |
1361*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1362*       |                     32-bit Sequence Number (A0)               |
1363*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1364*       |                              0x0                              |
1365*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1366*
1367*                                       AAD Format with 32-bit Sequence Number
1368*
1369*       if AAD is 12 bytes:
1370*       AAD[3] = {A0, A1, A2};
1371*       padded AAD in xmm register = {A2 A1 A0 0}
1372*
1373*       0                   1                   2                   3
1374*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1375*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1378*       |                               SPI (A2)                        |
1379*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1380*       |                 64-bit Extended Sequence Number {A1,A0}       |
1381*       |                                                               |
1382*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1383*       |                              0x0                              |
1384*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1385*
1386*                        AAD Format with 64-bit Extended Sequence Number
1387*
1388* aadLen:
1389*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1390*       The code supports 16 too but for other sizes, the code will fail.
1391*
1392* TLen:
1393*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1394*       For other sizes, the code will fail.
1395*
1396* poly = x^128 + x^127 + x^126 + x^121 + 1
1397*
1398*****************************************************************************/
1399ENTRY(aesni_gcm_dec)
/*
* Register roles in this function, as used by the code below:
*   %r14    caller's %rsp, restored before returning
*   %rsp    64-byte aligned scratch area of VARIABLE_OFFSET bytes; the
*           HashKey(%rsp) slot is filled in here
*   %r13    byte count of the whole 16-byte blocks not yet processed; reused
*           later for the length of the trailing partial block (%arg4 mod 16)
*   %r11    byte offset into the in/out buffers
*           (NOTE(review): first set inside INITIAL_BLOCKS_DEC, which is not
*           part of this function - confirm)
*   %xmm0   counter block
*   %xmm8   running GHASH value
*   %xmm13  HashKey<<1 (mod poly)
*
* The trailing partial block (1..15 bytes) is read with one 8-byte load
* and/or single-byte loads so that no byte outside
* in[0 .. plaintext_len) is touched, even when plaintext_len < 16.
*/
1400	push	%r12
1401	push	%r13
1402	push	%r14
1403	mov	%rsp, %r14
1404/*
1405* states of %xmm registers %xmm6:%xmm15 not saved
1406* all %xmm registers are clobbered
1407*/
1408	sub	$VARIABLE_OFFSET, %rsp
1409	and	$~63, %rsp                        # align rsp to 64 bytes
1410	mov	%arg6, %r12
1411	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
1412        movdqa  SHUF_MASK(%rip), %xmm2
1413	PSHUFB_XMM %xmm2, %xmm13
1414
1415
1416# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1417
1418	movdqa	%xmm13, %xmm2
1419	psllq	$1, %xmm13
1420	psrlq	$63, %xmm2
1421	movdqa	%xmm2, %xmm1
1422	pslldq	$8, %xmm2
1423	psrldq	$8, %xmm1
1424	por	%xmm2, %xmm13
1425
1426        # Reduction
1427
1428	pshufd	$0x24, %xmm1, %xmm2
1429	pcmpeqd TWOONE(%rip), %xmm2
1430	pand	POLY(%rip), %xmm2
1431	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
1432
1433
1434        # Decrypt first few blocks
1435
1436	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
1437	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
1438	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
1439	mov %r13, %r12
1440	and $(3<<4), %r12		    # %r12 = 16 * (number of whole blocks mod 4)
1441	jz _initial_num_blocks_is_0_decrypt
1442	cmp $(2<<4), %r12
1443	jb _initial_num_blocks_is_1_decrypt
1444	je _initial_num_blocks_is_2_decrypt
1445_initial_num_blocks_is_3_decrypt:
1446	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1447%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1448	sub	$48, %r13
1449	jmp	_initial_blocks_decrypted
1450_initial_num_blocks_is_2_decrypt:
1451	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1452%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1453	sub	$32, %r13
1454	jmp	_initial_blocks_decrypted
1455_initial_num_blocks_is_1_decrypt:
1456	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1457%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1458	sub	$16, %r13
1459	jmp	_initial_blocks_decrypted
1460_initial_num_blocks_is_0_decrypt:
1461	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1462%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1463_initial_blocks_decrypted:
1464	cmp	$0, %r13
1465	je	_zero_cipher_left_decrypt
1466	sub	$64, %r13
1467	je	_four_cipher_left_decrypt
1468_decrypt_by_4:
1469	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1470%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1471	add	$64, %r11
1472	sub	$64, %r13
1473	jne	_decrypt_by_4
1474_four_cipher_left_decrypt:
1475	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1476%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1477_zero_cipher_left_decrypt:
1478	mov	%arg4, %r13
1479	and	$15, %r13				# %r13 = arg4 (mod 16)
1480	je	_multiple_of_16_bytes_decrypt
1481
1482        # Handle the last <16 byte block separately
1483
1484	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
1485        movdqa SHUF_MASK(%rip), %xmm10
1486	PSHUFB_XMM %xmm10, %xmm0
1487
1488	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)

	# Read exactly %r13 (1..15) bytes of the partial block into %xmm1,
	# zero-padded at the top. A 16-byte load ending at the end of the
	# input would start before the buffer when the whole message is
	# shorter than 16 bytes, so the block is assembled piecewise.
	# ENCRYPT_SINGLE_BLOCK is done with %r10 and %eax at this point.
	lea	(%arg3,%r11,1), %r10	# %r10 = address of the partial block
	mov	%r13, %r12		# %r12 = bytes still to read
	cmp	$8, %r12
	jb	_partial_read_lt8_decrypt
	mov	(%r10), %rax
	MOVQ_R64_XMM	%rax, %xmm1	# bytes 0..7; upper 8 bytes become zero
	sub	$8, %r12
	jz	_partial_read_done_decrypt
	xor	%eax, %eax
_partial_read_hi_decrypt:		# gather bytes 8..%r13-1, last byte first
	shl	$8, %rax
	mov	7(%r10,%r12,1), %al
	sub	$1, %r12
	jnz	_partial_read_hi_decrypt
	MOVQ_R64_XMM	%rax, %xmm2
	pslldq	$8, %xmm2		# move them into the upper 8 bytes
	por	%xmm2, %xmm1
	jmp	_partial_read_done_decrypt
_partial_read_lt8_decrypt:
	xor	%eax, %eax
_partial_read_lo_decrypt:		# gather bytes 0..%r13-1, last byte first
	shl	$8, %rax
	mov	-1(%r10,%r12,1), %al
	sub	$1, %r12
	jnz	_partial_read_lo_decrypt
	MOVQ_R64_XMM	%rax, %xmm1	# upper 8 bytes become zero
_partial_read_done_decrypt:
	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12
# %r12 now points at a mask whose low %r13 bytes are 0xff
# (%r13 is the number of bytes in plaintext mod 16)
1498
1499	movdqa  %xmm1, %xmm2
1500	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
	movdqu (%r12), %xmm1
1502	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1503	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
1504	pand    %xmm1, %xmm2
1505        movdqa SHUF_MASK(%rip), %xmm10
1506	PSHUFB_XMM %xmm10 ,%xmm2
1507
1508	pxor %xmm2, %xmm8
1509	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1510	          # GHASH computation for the last <16 byte block
1513
1514        # output %r13 bytes
1515	MOVQ_R64_XMM	%xmm0, %rax
1516	cmp	$8, %r13
1517	jle	_less_than_8_bytes_left_decrypt
1518	mov	%rax, (%arg2 , %r11, 1)
1519	add	$8, %r11
1520	psrldq	$8, %xmm0
1521	MOVQ_R64_XMM	%xmm0, %rax
1522	sub	$8, %r13
1523_less_than_8_bytes_left_decrypt:
1524	mov	%al,  (%arg2, %r11, 1)
1525	add	$1, %r11
1526	shr	$8, %rax
1527	sub	$1, %r13
1528	jne	_less_than_8_bytes_left_decrypt
1529_multiple_of_16_bytes_decrypt:
1530	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1531	shl	$3, %r12		  # convert into number of bits
1532	movd	%r12d, %xmm15		  # len(A) in %xmm15
1533	shl	$3, %arg4		  # len(C) in bits (*8)
1534	MOVQ_R64_XMM	%arg4, %xmm1
1535	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
1536	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
1537	pxor	%xmm15, %xmm8
1538	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1539	         # final GHASH computation
1540        movdqa SHUF_MASK(%rip), %xmm10
1541	PSHUFB_XMM %xmm10, %xmm8
1542
1543	mov	%arg5, %rax		  # %rax = *Y0
1544	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
1545	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
1546	pxor	%xmm8, %xmm0		  # %xmm0 = tag = E(K, Y0) xor GHASH
1547_return_T_decrypt:
1548	mov	arg9, %r10                # %r10 = authTag
1549	mov	arg10, %r11               # %r11 = auth_tag_len
1550	cmp	$16, %r11
1551	je	_T_16_decrypt
1552	cmp	$8, %r11
1553	jl	_T_4_decrypt
1554_T_8_decrypt:
1555	MOVQ_R64_XMM	%xmm0, %rax
1556	mov	%rax, (%r10)
1557	add	$8, %r10
1558	sub	$8, %r11
1559	psrldq	$8, %xmm0
1560	cmp	$0, %r11
1561	je	_return_T_done_decrypt
1562_T_4_decrypt:
1563	movd	%xmm0, %eax
1564	mov	%eax, (%r10)
1565	add	$4, %r10
1566	sub	$4, %r11
1567	psrldq	$4, %xmm0
1568	cmp	$0, %r11
1569	je	_return_T_done_decrypt
1570_T_123_decrypt:
1571	movd	%xmm0, %eax
1572	cmp	$2, %r11
1573	jl	_T_1_decrypt
1574	mov	%ax, (%r10)
1575	cmp	$2, %r11
1576	je	_return_T_done_decrypt
1577	add	$2, %r10
1578	sar	$16, %eax
1579_T_1_decrypt:
1580	mov	%al, (%r10)
1581	jmp	_return_T_done_decrypt
1582_T_16_decrypt:
1583	movdqu	%xmm0, (%r10)
1584_return_T_done_decrypt:
1585	mov	%r14, %rsp
1586	pop	%r14
1587	pop	%r13
1588	pop	%r12
1589	ret
1590ENDPROC(aesni_gcm_dec)
1591
1592
1593/*****************************************************************************
1594* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1595*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1596*                    const u8 *in,       // Plaintext input
1597*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1598*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1599*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1600*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1601*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1602*                    const u8 *aad,      // Additional Authentication Data (AAD)
1603*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1604*                    u8 *auth_tag,       // Authenticated Tag output.
1605*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1606*                                        // 12 or 8.
1607*
1608* Assumptions:
1609*
1610* keys:
1611*       keys are pre-expanded and aligned to 16 bytes. we are using the
1612*       first set of 11, 13 or 15 round keys (AES-128/192/256) in void *aes_ctx
1613*
1614*
1615* iv:
1616*       0                   1                   2                   3
1617*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1618*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1619*       |                             Salt  (From the SA)               |
1620*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1621*       |                     Initialization Vector                     |
1622*       |         (This is the sequence number from IPSec header)       |
1623*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1624*       |                              0x1                              |
1625*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1626*
1627*
1628*
1629* AAD:
1630*       AAD padded to 128 bits with 0
1631*       for example, assume AAD is a u32 vector
1632*
1633*       if AAD is 8 bytes:
1634*       AAD[3] = {A0, A1};
1635*       padded AAD in xmm register = {A1 A0 0 0}
1636*
1637*       0                   1                   2                   3
1638*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1639*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640*       |                               SPI (A1)                        |
1641*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1642*       |                     32-bit Sequence Number (A0)               |
1643*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1644*       |                              0x0                              |
1645*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1646*
1647*                                 AAD Format with 32-bit Sequence Number
1648*
1649*       if AAD is 12 bytes:
1650*       AAD[3] = {A0, A1, A2};
1651*       padded AAD in xmm register = {A2 A1 A0 0}
1652*
1653*       0                   1                   2                   3
1654*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1655*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656*       |                               SPI (A2)                        |
1657*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1658*       |                 64-bit Extended Sequence Number {A1,A0}       |
1659*       |                                                               |
1660*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661*       |                              0x0                              |
1662*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1663*
1664*                         AAD Format with 64-bit Extended Sequence Number
1665*
1666* aadLen:
1667*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1668*       The code supports 16 too but for other sizes, the code will fail.
1669*
1670* TLen:
1671*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1672*       For other sizes, the code will fail.
1673*
1674* poly = x^128 + x^127 + x^126 + x^121 + 1
1675***************************************************************************/
/*
 * Body of aesni_gcm_enc.  The data is handled in four phases: 0-3 leading
 * 16-byte blocks, then groups of four blocks (64 bytes), then an optional
 * tail of fewer than 16 bytes, and finally the authentication tag.
 * %r12-%r14 are pushed on entry and popped on exit; %r14 keeps the entry
 * %rsp while the stack pointer is lowered and realigned for scratch space.
 */
1676ENTRY(aesni_gcm_enc)
1677	push	%r12
1678	push	%r13
1679	push	%r14
1680	mov	%rsp, %r14
1681#
1682# states of %xmm registers %xmm6:%xmm15 not saved
1683# all %xmm registers are clobbered
1684#
	/* reserve VARIABLE_OFFSET bytes of scratch and align %rsp to 64 */
1685	sub	$VARIABLE_OFFSET, %rsp
1686	and	$~63, %rsp
	/* load 16 bytes from *arg6 (presumably the hash subkey - confirm) */
1687	mov	%arg6, %r12
1688	movdqu	(%r12), %xmm13
1689        movdqa  SHUF_MASK(%rip), %xmm2
1690	PSHUFB_XMM %xmm2, %xmm13
1691
1692
1693# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1694
	/*
	 * 128-bit left shift by one: shift both qwords, then carry bit 63
	 * of the low qword into the high qword (%xmm2); %xmm1 receives the
	 * bit shifted out of the top of the high qword.
	 */
1695	movdqa	%xmm13, %xmm2
1696	psllq	$1, %xmm13
1697	psrlq	$63, %xmm2
1698	movdqa	%xmm2, %xmm1
1699	pslldq	$8, %xmm2
1700	psrldq	$8, %xmm1
1701	por	%xmm2, %xmm13
1702
1703        # reduce HashKey<<1
1704
	/* build a mask from the shifted-out bit and xor in POLY when set */
1705	pshufd	$0x24, %xmm1, %xmm2
1706	pcmpeqd TWOONE(%rip), %xmm2
1707	pand	POLY(%rip), %xmm2
1708	pxor	%xmm2, %xmm13
1709	movdqa	%xmm13, HashKey(%rsp)
1710	mov	%arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
	/* %r13 = arg4 rounded down to a multiple of 16 (whole blocks only) */
1711	and	$-16, %r13
1712	mov	%r13, %r12
1713
1714        # Encrypt first few blocks
1715
	/* %r12 = (number of whole blocks mod 4) * 16: selects 0-3 below */
1716	and	$(3<<4), %r12
1717	jz	_initial_num_blocks_is_0_encrypt
1718	cmp	$(2<<4), %r12
1719	jb	_initial_num_blocks_is_1_encrypt
1720	je	_initial_num_blocks_is_2_encrypt
1721_initial_num_blocks_is_3_encrypt:
1722	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1723%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1724	sub	$48, %r13
1725	jmp	_initial_blocks_encrypted
1726_initial_num_blocks_is_2_encrypt:
1727	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1728%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1729	sub	$32, %r13
1730	jmp	_initial_blocks_encrypted
1731_initial_num_blocks_is_1_encrypt:
1732	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1733%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1734	sub	$16, %r13
1735	jmp	_initial_blocks_encrypted
1736_initial_num_blocks_is_0_encrypt:
1737	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1738%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1739_initial_blocks_encrypted:
1740
1741        # Main loop - Encrypt remaining blocks
1742
	/*
	 * %r13 = whole-block bytes still to process, a multiple of 64 here
	 * given the subtractions above.  %r11 is the running data offset;
	 * its initial value is set outside this block (presumably by
	 * INITIAL_BLOCKS_ENC - confirm).
	 */
1743	cmp	$0, %r13
1744	je	_zero_cipher_left_encrypt
1745	sub	$64, %r13
1746	je	_four_cipher_left_encrypt
1747_encrypt_by_4_encrypt:
1748	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1749%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1750	add	$64, %r11
1751	sub	$64, %r13
1752	jne	_encrypt_by_4_encrypt
1753_four_cipher_left_encrypt:
1754	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1755%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1756_zero_cipher_left_encrypt:
1757	mov	%arg4, %r13
1758	and	$15, %r13			# %r13 = arg4 (mod 16)
1759	je	_multiple_of_16_bytes_encrypt
1760
1761         # Handle the last <16 Byte block separately
1762	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
1763        movdqa SHUF_MASK(%rip), %xmm10
1764	PSHUFB_XMM %xmm10, %xmm0
1765
1766
1767	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
	/*
	 * Step %r11 back by 16 - %r13 so that the 16-byte load below ends
	 * exactly at the last input byte.
	 * NOTE(review): the load therefore begins 16 - %r13 bytes before
	 * the current position; if %r11 is 0 here (total input shorter
	 * than 16 bytes) it reads memory in front of %arg3 - confirm that
	 * callers guarantee this is readable.
	 */
1768	sub $16, %r11
1769	add %r13, %r11
1770	movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte blocks
1771	lea SHIFT_MASK+16(%rip), %r12
1772	sub %r13, %r12
1773	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1774	# (%r13 is the number of bytes in plaintext mod 16)
1775	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
1776	PSHUFB_XMM	%xmm2, %xmm1            # shift right 16-r13 byte
1777	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
	/* %r12 = SHIFT_MASK+16-%r13, so this reads from ALL_F+16-%r13 */
1778	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
1779	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
1780	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
1781        movdqa SHUF_MASK(%rip), %xmm10
1782	PSHUFB_XMM %xmm10,%xmm0
1783
1784	pxor	%xmm0, %xmm8
1785	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1786	# GHASH computation for the last <16 byte block
	/* undo the earlier adjustment: %r11 is the tail offset again */
1787	sub	%r13, %r11
1788	add	$16, %r11
1789
1790	movdqa SHUF_MASK(%rip), %xmm10
1791	PSHUFB_XMM %xmm10, %xmm0
1792
1793	# shuffle xmm0 back to output as ciphertext
1794
1795        # Output %r13 bytes
	/*
	 * More than 8 bytes left: store the low qword in one go and move
	 * the high qword into %rax.  The remainder (1-8 bytes) is written
	 * one byte at a time so nothing past the tail is stored.
	 */
1796	MOVQ_R64_XMM %xmm0, %rax
1797	cmp $8, %r13
1798	jle _less_than_8_bytes_left_encrypt
1799	mov %rax, (%arg2 , %r11, 1)
1800	add $8, %r11
1801	psrldq $8, %xmm0
1802	MOVQ_R64_XMM %xmm0, %rax
1803	sub $8, %r13
1804_less_than_8_bytes_left_encrypt:
1805	mov %al,  (%arg2, %r11, 1)
1806	add $1, %r11
1807	shr $8, %rax
1808	sub $1, %r13
1809	jne _less_than_8_bytes_left_encrypt
1810_multiple_of_16_bytes_encrypt:
	/* fold the bit lengths len(A)||len(C) into the hash */
1811	mov	arg8, %r12    # %r12 = addLen (number of bytes)
1812	shl	$3, %r12
1813	movd	%r12d, %xmm15       # len(A) in %xmm15
	/* note: this shifts the arg4 register itself in place */
1814	shl	$3, %arg4               # len(C) in bits (*128)
1815	MOVQ_R64_XMM	%arg4, %xmm1
1816	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
1817	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
1818	pxor	%xmm15, %xmm8
1819	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1820	# final GHASH computation
1821        movdqa SHUF_MASK(%rip), %xmm10
1822	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
1823
	/* tag = Encrypt(K, Y0) xor GHASH result */
1824	mov	%arg5, %rax		       # %rax  = *Y0
1825	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
1826	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
1827	pxor	%xmm8, %xmm0
1828_return_T_encrypt:
	/*
	 * Write auth_tag_len bytes of the tag from %xmm0: 16 at once, or
	 * otherwise an 8-byte piece (when at least 8 remain), then a
	 * 4-byte piece, then the last 1-3 bytes.
	 */
1829	mov	arg9, %r10                     # %r10 = authTag
1830	mov	arg10, %r11                    # %r11 = auth_tag_len
1831	cmp	$16, %r11
1832	je	_T_16_encrypt
1833	cmp	$8, %r11
1834	jl	_T_4_encrypt
1835_T_8_encrypt:
1836	MOVQ_R64_XMM	%xmm0, %rax
1837	mov	%rax, (%r10)
1838	add	$8, %r10
1839	sub	$8, %r11
1840	psrldq	$8, %xmm0
1841	cmp	$0, %r11
1842	je	_return_T_done_encrypt
1843_T_4_encrypt:
1844	movd	%xmm0, %eax
1845	mov	%eax, (%r10)
1846	add	$4, %r10
1847	sub	$4, %r11
1848	psrldq	$4, %xmm0
1849	cmp	$0, %r11
1850	je	_return_T_done_encrypt
1851_T_123_encrypt:
	/* 1-3 bytes left: store 2 bytes when possible, then a single byte */
1852	movd	%xmm0, %eax
1853	cmp	$2, %r11
1854	jl	_T_1_encrypt
1855	mov	%ax, (%r10)
1856	cmp	$2, %r11
1857	je	_return_T_done_encrypt
1858	add	$2, %r10
1859	sar	$16, %eax
1860_T_1_encrypt:
1861	mov	%al, (%r10)
1862	jmp	_return_T_done_encrypt
1863_T_16_encrypt:
1864	movdqu	%xmm0, (%r10)
1865_return_T_done_encrypt:
	/* drop the aligned scratch area and restore callee-saved registers */
1866	mov	%r14, %rsp
1867	pop	%r14
1868	pop	%r13
1869	pop	%r12
1870	ret
1871ENDPROC(aesni_gcm_enc)
1872
1873#endif
1874
1875
/*
 * _key_expansion_128 / _key_expansion_256a: internal helper
 *	Derive the next round key in %xmm0 and append it to the schedule.
 * input:
 *	%xmm0:	previous round key
 *	%xmm1:	AESKEYGENASSIST result; dword 3 is broadcast and mixed in
 *	%xmm4:	low dword must be zero (it is still zero on return)
 *	TKEYP:	where the new round key is stored
 * output:
 *	%xmm0:	new round key, also written to (TKEYP)
 *	TKEYP:	advanced by 16
 * changed:
 *	%xmm1, %xmm4
 */
1876.align 4
1877_key_expansion_128:
1878_key_expansion_256a:
1879	pshufd $0b11111111, %xmm1, %xmm1
	/*
	 * The two shufps/pxor pairs turn %xmm0 = (k0,k1,k2,k3) into the
	 * running xor (k0, k0^k1, k0^k1^k2, k0^k1^k2^k3).
	 */
1880	shufps $0b00010000, %xmm0, %xmm4
1881	pxor %xmm4, %xmm0
1882	shufps $0b10001100, %xmm0, %xmm4
1883	pxor %xmm4, %xmm0
1884	pxor %xmm1, %xmm0
1885	movaps %xmm0, (TKEYP)
1886	add $0x10, TKEYP
1887	ret
1888ENDPROC(_key_expansion_128)
1889ENDPROC(_key_expansion_256a)
1890
/*
 * _key_expansion_192a: internal helper
 *	Advance the 192-bit key state by one step and store 32 bytes of
 *	schedule.
 * input:
 *	%xmm0:	first 128 bits of the current key state
 *	%xmm2:	remaining 64 bits of the key state (low qword)
 *	%xmm1:	AESKEYGENASSIST result; dword 1 is broadcast and mixed in
 *	%xmm4:	low dword must be zero
 *	TKEYP:	where the schedule output is stored
 * output:
 *	(TKEYP):	low qword of the old %xmm2, then low qword of new %xmm0
 *	0x10(TKEYP):	high qword of new %xmm0, then low qword of new %xmm2
 *	%xmm0, %xmm2:	updated key state
 *	TKEYP:	advanced by 32
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
1891.align 4
1892_key_expansion_192a:
1893	pshufd $0b01010101, %xmm1, %xmm1
	/* running xor of the four dwords of %xmm0, then mix in %xmm1 */
1894	shufps $0b00010000, %xmm0, %xmm4
1895	pxor %xmm4, %xmm0
1896	shufps $0b10001100, %xmm0, %xmm4
1897	pxor %xmm4, %xmm0
1898	pxor %xmm1, %xmm0
1899
	/* %xmm2 ^= (%xmm2 << 32 bits) ^ broadcast(dword 3 of new %xmm0) */
1900	movaps %xmm2, %xmm5
1901	movaps %xmm2, %xmm6
1902	pslldq $4, %xmm5
1903	pshufd $0b11111111, %xmm0, %xmm3
1904	pxor %xmm3, %xmm2
1905	pxor %xmm5, %xmm2
1906
	/* repack the 192-bit state into two 128-bit schedule entries */
1907	movaps %xmm0, %xmm1
1908	shufps $0b01000100, %xmm0, %xmm6
1909	movaps %xmm6, (TKEYP)
1910	shufps $0b01001110, %xmm2, %xmm1
1911	movaps %xmm1, 0x10(TKEYP)
1912	add $0x20, TKEYP
1913	ret
1914ENDPROC(_key_expansion_192a)
1915
/*
 * _key_expansion_192b: internal helper
 *	Same key state update as _key_expansion_192a, but only the new
 *	%xmm0 is stored (16 bytes).
 * input:
 *	%xmm0, %xmm2:	current key state
 *	%xmm1:	AESKEYGENASSIST result; dword 1 is broadcast and mixed in
 *	%xmm4:	low dword must be zero
 *	TKEYP:	where the schedule output is stored
 * output:
 *	(TKEYP):	new %xmm0
 *	%xmm0, %xmm2:	updated key state
 *	TKEYP:	advanced by 16
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
1916.align 4
1917_key_expansion_192b:
1918	pshufd $0b01010101, %xmm1, %xmm1
	/* running xor of the four dwords of %xmm0, then mix in %xmm1 */
1919	shufps $0b00010000, %xmm0, %xmm4
1920	pxor %xmm4, %xmm0
1921	shufps $0b10001100, %xmm0, %xmm4
1922	pxor %xmm4, %xmm0
1923	pxor %xmm1, %xmm0
1924
	/* %xmm2 ^= (%xmm2 << 32 bits) ^ broadcast(dword 3 of new %xmm0) */
1925	movaps %xmm2, %xmm5
1926	pslldq $4, %xmm5
1927	pshufd $0b11111111, %xmm0, %xmm3
1928	pxor %xmm3, %xmm2
1929	pxor %xmm5, %xmm2
1930
1931	movaps %xmm0, (TKEYP)
1932	add $0x10, TKEYP
1933	ret
1934ENDPROC(_key_expansion_192b)
1935
/*
 * _key_expansion_256b: internal helper
 *	Counterpart of _key_expansion_256a operating on %xmm2 (the second
 *	half of the 256-bit key state).
 * input:
 *	%xmm2:	previous round key of the second half
 *	%xmm1:	AESKEYGENASSIST result; dword 2 is broadcast and mixed in
 *	%xmm4:	low dword must be zero (it is still zero on return)
 *	TKEYP:	where the new round key is stored
 * output:
 *	%xmm2:	new round key, also written to (TKEYP)
 *	TKEYP:	advanced by 16
 * changed:
 *	%xmm1, %xmm4
 */
1936.align 4
1937_key_expansion_256b:
1938	pshufd $0b10101010, %xmm1, %xmm1
	/* running xor of the four dwords of %xmm2, then mix in %xmm1 */
1939	shufps $0b00010000, %xmm2, %xmm4
1940	pxor %xmm4, %xmm2
1941	shufps $0b10001100, %xmm2, %xmm4
1942	pxor %xmm4, %xmm2
1943	pxor %xmm1, %xmm2
1944	movaps %xmm2, (TKEYP)
1945	add $0x10, TKEYP
1946	ret
1947ENDPROC(_key_expansion_256b)
1948
1949/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Expand in_key into the encryption round keys stored from offset 0 of
 * ctx, then derive the decryption round keys stored from offset 240 of
 * ctx.  key_len is recorded at offset 480 of ctx.  The key size is picked
 * by comparing the low byte of key_len with 24: below selects the 128-bit
 * path, equal the 192-bit path, above the 256-bit path.  AREG is zeroed
 * before returning (presumably the return register, i.e. the function
 * returns 0 - confirm against the AREG definition).
 */
1953ENTRY(aesni_set_key)
1954	FRAME_BEGIN
1955#ifndef __x86_64__
1956	pushl KEYP
1957	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1958	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1959	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1960#endif
	/* round key 0 is the first 16 bytes of the user key, stored as is */
1961	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1962	movaps %xmm0, (KEYP)
1963	lea 0x10(KEYP), TKEYP		# key addr
1964	movl %edx, 480(KEYP)
1965	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1966	cmp $24, %dl
1967	jb .Lenc_key128
1968	je .Lenc_key192
	/*
	 * 256-bit key: the second 16 bytes form round key 1; afterwards the
	 * 256a/256b helpers alternate, each appending one round key.
	 */
1969	movups 0x10(UKEYP), %xmm2	# other user key
1970	movaps %xmm2, (TKEYP)
1971	add $0x10, TKEYP
1972	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1973	call _key_expansion_256a
1974	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1975	call _key_expansion_256b
1976	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1977	call _key_expansion_256a
1978	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1979	call _key_expansion_256b
1980	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1981	call _key_expansion_256a
1982	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1983	call _key_expansion_256b
1984	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1985	call _key_expansion_256a
1986	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1987	call _key_expansion_256b
1988	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1989	call _key_expansion_256a
1990	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1991	call _key_expansion_256b
1992	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1993	call _key_expansion_256a
1994	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1995	call _key_expansion_256b
1996	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1997	call _key_expansion_256a
1998	jmp .Ldec_key
1999.Lenc_key192:
	/*
	 * 192-bit key: the remaining 8 key bytes go into the low qword of
	 * %xmm2; the 192a helper appends 32 bytes of schedule per call and
	 * the 192b helper 16 bytes.
	 */
2000	movq 0x10(UKEYP), %xmm2		# other user key
2001	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
2002	call _key_expansion_192a
2003	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
2004	call _key_expansion_192b
2005	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
2006	call _key_expansion_192a
2007	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
2008	call _key_expansion_192b
2009	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
2010	call _key_expansion_192a
2011	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
2012	call _key_expansion_192b
2013	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
2014	call _key_expansion_192a
2015	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
2016	call _key_expansion_192b
2017	jmp .Ldec_key
2018.Lenc_key128:
	/* 128-bit key: ten calls, each appending one round key */
2019	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
2020	call _key_expansion_128
2021	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
2022	call _key_expansion_128
2023	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
2024	call _key_expansion_128
2025	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
2026	call _key_expansion_128
2027	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
2028	call _key_expansion_128
2029	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
2030	call _key_expansion_128
2031	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
2032	call _key_expansion_128
2033	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
2034	call _key_expansion_128
2035	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
2036	call _key_expansion_128
2037	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
2038	call _key_expansion_128
2039.Ldec_key:
	/*
	 * Build the decryption schedule at ctx + 240 in reverse order.
	 * TKEYP is stepped back to the last encryption round key.  The first
	 * and last encryption round keys swap places unchanged; every key
	 * in between goes through AESIMC.  UKEYP is reused as the
	 * descending store pointer.
	 */
2040	sub $0x10, TKEYP
2041	movaps (KEYP), %xmm0
2042	movaps (TKEYP), %xmm1
2043	movaps %xmm0, 240(TKEYP)
2044	movaps %xmm1, 240(KEYP)
2045	add $0x10, KEYP
2046	lea 240-16(TKEYP), UKEYP
2047.align 4
2048.Ldec_key_loop:
2049	movaps (KEYP), %xmm0
2050	AESIMC %xmm0 %xmm1
2051	movaps %xmm1, (UKEYP)
2052	add $0x10, KEYP
2053	sub $0x10, UKEYP
2054	cmp TKEYP, KEYP
2055	jb .Ldec_key_loop
2056	xor AREG, AREG
2057#ifndef __x86_64__
2058	popl KEYP
2059#endif
2060	FRAME_END
2061	ret
2062ENDPROC(aesni_set_key)
2063
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt the single 16-byte block at src with the key schedule in ctx
 * and write the result to dst.  On 32-bit builds the arguments are read
 * from the stack and KEYP/KLEN are saved and restored around the body.
 */
ENTRY(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movups (INP), STATE		# plaintext block (may be unaligned)
	mov 480(KEYP), KLEN		# key length stored at ctx + 480
	call _aesni_enc1
	movups STATE, (OUTP)		# ciphertext block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_enc)
2087
2088/*
 * _aesni_enc1:		internal ABI
 *	Encrypt the single block held in STATE.
 * input:
 *	KEYP:		key struct pointer (start of the round keys)
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
2100.align 4
2101_aesni_enc1:
2102	movaps (KEYP), KEY		# key
2103	mov KEYP, TKEYP
2104	pxor KEY, STATE		# round 0
	/*
	 * TKEYP ends up at KEYP+0x30, +0x50 or +0x70 for the short, middle
	 * and long key sizes, so the shared tail below always finishes with
	 * the last round key at 0x70(TKEYP).  The lea is used because it
	 * leaves the flags from the cmp intact for the following je.
	 */
2105	add $0x30, TKEYP
2106	cmp $24, KLEN
2107	jb .Lenc128
2108	lea 0x20(TKEYP), TKEYP
2109	je .Lenc192
2110	add $0x20, TKEYP
	/* two extra rounds only for KLEN > 24 */
2111	movaps -0x60(TKEYP), KEY
2112	AESENC KEY STATE
2113	movaps -0x50(TKEYP), KEY
2114	AESENC KEY STATE
2115.align 4
2116.Lenc192:
	/* two extra rounds for KLEN >= 24 */
2117	movaps -0x40(TKEYP), KEY
2118	AESENC KEY STATE
2119	movaps -0x30(TKEYP), KEY
2120	AESENC KEY STATE
2121.align 4
2122.Lenc128:
	/* rounds common to every key size: nine AESENC, then AESENCLAST */
2123	movaps -0x20(TKEYP), KEY
2124	AESENC KEY STATE
2125	movaps -0x10(TKEYP), KEY
2126	AESENC KEY STATE
2127	movaps (TKEYP), KEY
2128	AESENC KEY STATE
2129	movaps 0x10(TKEYP), KEY
2130	AESENC KEY STATE
2131	movaps 0x20(TKEYP), KEY
2132	AESENC KEY STATE
2133	movaps 0x30(TKEYP), KEY
2134	AESENC KEY STATE
2135	movaps 0x40(TKEYP), KEY
2136	AESENC KEY STATE
2137	movaps 0x50(TKEYP), KEY
2138	AESENC KEY STATE
2139	movaps 0x60(TKEYP), KEY
2140	AESENC KEY STATE
2141	movaps 0x70(TKEYP), KEY
2142	AESENCLAST KEY STATE
2143	ret
2144ENDPROC(_aesni_enc1)
2145
/*
 * _aesni_enc4:	internal ABI
 *	Encrypt four independent blocks with the same key schedule.
 * input:
 *	KEYP:		key struct pointer (start of the round keys)
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE1..STATE4:	the four input blocks
 * output:
 *	STATE1..STATE4:	the four encrypted blocks
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Each round loads one round key and applies it to all four states.  The
 * repeated rounds are generated with .irp over the round key offsets; the
 * emitted instruction sequence is the same as the hand-unrolled form.
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0: plain xor with the key
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for the je
	je .L4enc192
	add $0x20, TKEYP
	.irp koff, -0x60, -0x50		# extra rounds for KLEN > 24
	movaps \koff(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	.endr
.L4enc192:
	.irp koff, -0x40, -0x30		# extra rounds for KLEN >= 24
	movaps \koff(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	.endr
.L4enc128:
	.irp koff, -0x20, -0x10
	movaps \koff(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	.endr
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	.irp koff, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \koff(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	.endr
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)
2254
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt the single 16-byte block at src and write the result to dst.
 * The decryption round keys start 240 bytes into ctx, so KEYP is moved
 * there before _aesni_dec1 is called.  On 32-bit builds the arguments are
 * read from the stack and KEYP/KLEN are saved and restored.
 */
ENTRY(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movups (INP), STATE		# ciphertext block (may be unaligned)
	mov 480(KEYP), KLEN		# key length, read before KEYP moves
	add $240, KEYP			# switch to the decryption round keys
	call _aesni_dec1
	movups STATE, (OUTP)		# plaintext block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_dec)
2279
2280/*
 * _aesni_dec1:		internal ABI
 *	Decrypt the single block held in STATE.
 * input:
 *	KEYP:		pointer to the first decryption round key
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
2292.align 4
2293_aesni_dec1:
2294	movaps (KEYP), KEY		# key
2295	mov KEYP, TKEYP
2296	pxor KEY, STATE		# round 0
	/*
	 * TKEYP ends up at KEYP+0x30, +0x50 or +0x70 for the short, middle
	 * and long key sizes, so the shared tail below always finishes with
	 * the last round key at 0x70(TKEYP).  The lea is used because it
	 * leaves the flags from the cmp intact for the following je.
	 */
2297	add $0x30, TKEYP
2298	cmp $24, KLEN
2299	jb .Ldec128
2300	lea 0x20(TKEYP), TKEYP
2301	je .Ldec192
2302	add $0x20, TKEYP
	/* two extra rounds only for KLEN > 24 */
2303	movaps -0x60(TKEYP), KEY
2304	AESDEC KEY STATE
2305	movaps -0x50(TKEYP), KEY
2306	AESDEC KEY STATE
2307.align 4
2308.Ldec192:
	/* two extra rounds for KLEN >= 24 */
2309	movaps -0x40(TKEYP), KEY
2310	AESDEC KEY STATE
2311	movaps -0x30(TKEYP), KEY
2312	AESDEC KEY STATE
2313.align 4
2314.Ldec128:
	/* rounds common to every key size: nine AESDEC, then AESDECLAST */
2315	movaps -0x20(TKEYP), KEY
2316	AESDEC KEY STATE
2317	movaps -0x10(TKEYP), KEY
2318	AESDEC KEY STATE
2319	movaps (TKEYP), KEY
2320	AESDEC KEY STATE
2321	movaps 0x10(TKEYP), KEY
2322	AESDEC KEY STATE
2323	movaps 0x20(TKEYP), KEY
2324	AESDEC KEY STATE
2325	movaps 0x30(TKEYP), KEY
2326	AESDEC KEY STATE
2327	movaps 0x40(TKEYP), KEY
2328	AESDEC KEY STATE
2329	movaps 0x50(TKEYP), KEY
2330	AESDEC KEY STATE
2331	movaps 0x60(TKEYP), KEY
2332	AESDEC KEY STATE
2333	movaps 0x70(TKEYP), KEY
2334	AESDECLAST KEY STATE
2335	ret
2336ENDPROC(_aesni_dec1)
2337
/*
 * _aesni_dec4:	internal ABI
 *	Decrypt four independent blocks with the same key schedule.
 * input:
 *	KEYP:		pointer to the first decryption round key
 *	KLEN:		key length in bytes (compared against 24 below)
 *	STATE1..STATE4:	the four input blocks
 * output:
 *	STATE1..STATE4:	the four decrypted blocks
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Each round loads one round key and applies it to all four states.  The
 * repeated rounds are generated with .irp over the round key offsets; the
 * emitted instruction sequence is the same as the hand-unrolled form.
 */
.align 4
_aesni_dec4:
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0: plain xor with the key
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags for the je
	je .L4dec192
	add $0x20, TKEYP
	.irp koff, -0x60, -0x50		# extra rounds for KLEN > 24
	movaps \koff(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	.endr
.align 4
.L4dec192:
	.irp koff, -0x40, -0x30		# extra rounds for KLEN >= 24
	movaps \koff(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	.endr
.align 4
.L4dec128:
	.irp koff, -0x20, -0x10
	movaps \koff(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	.endr
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	.irp koff, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \koff(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	.endr
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
ENDPROC(_aesni_dec4)
2446
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt the whole 16-byte blocks of src into dst: four blocks per
 * iteration through _aesni_enc4 while at least 64 bytes remain, then one
 * block per iteration through _aesni_enc1.  A trailing partial block
 * (len % 16 bytes) is neither read nor written.  Nothing is done when
 * len < 16.
 *
 * len is a size and is compared as an unsigned value everywhere, so the
 * loop conditions agree with the entry checks for every possible length.
 */
ENTRY(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN	# key length stored at ctx + 480
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4	# unsigned compare: LEN is a byte count
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1	# unsigned compare: LEN is a byte count
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_enc)
2506
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt the whole 16-byte blocks of src into dst using the
 * decryption round keys at ctx + 240: four blocks per iteration through
 * _aesni_dec4 while at least 64 bytes remain, then one block per
 * iteration through _aesni_dec1.  A trailing partial block (len % 16
 * bytes) is neither read nor written.  Nothing is done when len < 16.
 *
 * len is a size and is compared as an unsigned value everywhere, so the
 * loop conditions agree with the entry checks for every possible length.
 */
ENTRY(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN	# key length stored at ctx + 480
	add $240, KEYP		# switch to the decryption round keys
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4	# unsigned compare: LEN is a byte count
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1	# unsigned compare: LEN is a byte count
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_dec)
2567
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt the whole 16-byte blocks of src into dst.  STATE carries
 * the chaining value: it starts as *iv, each input block is xored into it
 * and the result is encrypted in place.  After the last block the final
 * ciphertext block is written back to *iv.  When len < 16 nothing is
 * processed and *iv is left untouched; a trailing partial block is
 * ignored.
 *
 * len is a size and is compared as an unsigned value everywhere.
 */
ENTRY(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN	# key length stored at ctx + 480
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		# chain: plaintext xor previous ciphertext
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop	# unsigned compare: LEN is a byte count
	movups STATE, (IVP)	# last ciphertext block is the next iv
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_enc)
2611
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt the whole 16-byte blocks of src into dst using the
 * decryption round keys at ctx + 240.  Four blocks are decrypted per
 * iteration through _aesni_dec4 while at least 64 bytes remain, then one
 * block per iteration through _aesni_dec1.  IV holds the previous
 * ciphertext block (initially *iv) and is written back to *iv at the end.
 * When len < 16 nothing is processed and *iv is left untouched; a
 * trailing partial block is ignored.
 *
 * On 64-bit builds the four ciphertext blocks stay in IN1..IN4 across the
 * call.  On 32-bit builds only IN1/IN2 are used, so the first two
 * ciphertext blocks are reloaded from src after the call; all loads of a
 * group happen before its stores.
 *
 * len is a size and is compared as an unsigned value everywhere.
 */
ENTRY(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN	# key length stored at ctx + 480
	add $240, KEYP		# switch to the decryption round keys
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1	# IN1/IN2 now hold blocks 3 and 4
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1		# block 1 chains with the incoming iv
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV		# last ciphertext block is the next iv
#else
	pxor IN1, STATE4	# IN1 = ciphertext block 3
	movaps IN2, IV		# IN2 = ciphertext block 4, the next iv
	movups (INP), IN1	# reload ciphertext blocks 1 and 2
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4	# unsigned compare: LEN is a byte count
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV		# this ciphertext block chains into the next
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1	# unsigned compare: LEN is a byte count
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_dec)
2704
2705#ifdef __x86_64__
/*
 * PSHUFB control mask that reverses the 16 bytes of an XMM register
 * (byte i of the result comes from byte 15 - i of the source).  Loaded
 * into BSWAP_MASK by _aesni_inc_init.  Kept 16-byte aligned so that it
 * can be fetched with movaps.
 */
2706.pushsection .rodata
2707.align 16
2708.Lbswap_mask:
2709	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2710.popsection
2711
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 *
 * The mask is fetched RIP-relative, like the other constant loads in this
 * file, so the reference does not depend on the absolute link address.
 */
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# byte-reverse IV into CTR
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1 in the low qword
	MOVQ_R64_XMM CTR TCTR_LOW	# shadow copy of the low qword of CTR
	ret
ENDPROC(_aesni_inc_init)
2733
/*
 * _aesni_inc:		internal ABI
 *	Add 1 to the 128-bit counter and refresh IV from it.
 * input:
 *	CTR:	counter in little endian
 *	TCTR_LOW: copy of the lower qword of CTR
 *	INC:	1 in the low qword
 *	BSWAP_MASK: endian swapping mask
 * output:
 *	IV:	byte-swapped copy of the incremented counter
 * changed:
 *	CTR:	incremented counter, in little endian
 *	TCTR_LOW: lower qword of the incremented counter
 *
 * paddq adds the two qwords independently, so the carry out of the low
 * qword is detected on the general purpose copy (TCTR_LOW) and then
 * applied to the high qword by temporarily shifting INC up by 8 bytes.
 */
.align 4
_aesni_inc:
	add $1, TCTR_LOW		# CF is set when the low qword wraps
	paddq INC, CTR			# SSE add, leaves the flags alone
	jnc .Linc_low
	pslldq $8, INC			# INC = 1 in the high qword
	paddq INC, CTR			# propagate the carry
	psrldq $8, INC			# INC = 1 in the low qword again
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV
	ret
ENDPROC(_aesni_inc)
2762
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode over the whole 16-byte blocks of src: each counter value is
 * encrypted and xored with the matching input block.  Four blocks are
 * handled per iteration through _aesni_enc4 while at least 64 bytes
 * remain, then one block per iteration through _aesni_enc1.  The counter
 * following the last one used is written back to *iv.  When len < 16
 * nothing is processed and *iv is left untouched; a trailing partial
 * block is ignored.
 *
 * len is a size and is compared as an unsigned value everywhere.
 */
ENTRY(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN	# key length stored at ctx + 480
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* each STATEn takes the current counter, then IV is stepped */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4	# unsigned compare: LEN is a byte count
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1	# unsigned compare: LEN is a byte count
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	ret
ENDPROC(aesni_ctr_enc)
2825
2826/*
 * _aesni_gf128mul_x_ble:		internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 *
 * How it works: pshufd $0x13 places dword 3 of IV in dword 0 of CTR and
 * dword 1 of IV in dword 2 of CTR; psrad $31 turns each of those into an
 * all-ones or all-zeroes value depending on its top bit, and pand keeps
 * only the GF128MUL_MASK bits.  paddq IV, IV doubles both qwords
 * separately, so the pxor finally puts back the bit carried from the low
 * qword into the high qword and applies the reduction value when the top
 * bit of IV was set.
 */
2837#define _aesni_gf128mul_x_ble() \
2838	pshufd $0x13, IV, CTR; \
2839	paddq IV, IV; \
2840	psrad $31, CTR; \
2841	pand GF128MUL_MASK, CTR; \
2842	pxor CTR, IV;
2843
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 *
 * Process eight consecutive 16-byte blocks (0x00-0x7f of src/dst) in XTS
 * fashion as two groups of four.  For each block the current tweak (IV)
 * is xored into the input, the tweak itself is parked in the matching dst
 * slot, the group is run through _aesni_enc4 or _aesni_dec4, and the
 * parked tweak is then read back from dst and xored into the result
 * before the final store.  The tweak is advanced with
 * _aesni_gf128mul_x_ble() between blocks and the tweak following the
 * eighth block is written back to *iv.
 *
 * enc arrives in %cl: zero selects _aesni_dec4 together with the key
 * schedule at ctx + 240, non-zero selects _aesni_enc4 with the schedule
 * at ctx + 0.  %r11 holds the selected routine, INC is used as a scratch
 * register.  All symbol references are RIP-relative, like the other
 * constant loads in this file.
 */
ENTRY(aesni_xts_crypt8)
	FRAME_BEGIN
	cmpb $0, %cl			# ZF set when enc == false
	movl $0, %ecx			# mov, not xor: the flags must survive
	movl $240, %r10d
	leaq _aesni_enc4(%rip), %r11
	leaq _aesni_dec4(%rip), %rax
	cmovel %r10d, %ecx		# decrypt: key schedule offset 240
	cmoveq %rax, %r11		# decrypt: call _aesni_dec4 instead

	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN		# key length, read before KEYP moves
	addq %rcx, KEYP

	/* blocks 0-3: xor in the tweak, park the tweak in dst */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	call *%r11

	/*
	 * Finish blocks 0-3 (xor with the parked tweak, store the result)
	 * interleaved with the preparation of blocks 4-7.
	 */
	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	/* tweak for the block after this batch goes back to the caller */
	_aesni_gf128mul_x_ble()
	movups IV, (IVP)

	call *%r11

	/* finish blocks 4-7 */
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
ENDPROC(aesni_xts_crypt8)
2953
2954#endif
2955