xref: /openbmc/linux/arch/x86/crypto/aesni-intel_asm.S (revision 4f727ecefefbd180de10e25b3e74c03dce3f1e75)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Implement AES algorithm in Intel AES-NI instructions.
4 *
5 * The white paper of AES-NI instructions can be downloaded from:
6 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 *
8 * Copyright (C) 2008, Intel Corp.
9 *    Author: Huang Ying <ying.huang@intel.com>
10 *            Vinodh Gopal <vinodh.gopal@intel.com>
11 *            Kahraman Akdemir
12 *
13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14 * interface for 64-bit kernels.
15 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
17 *             Adrian Hoban <adrian.hoban@intel.com>
18 *             James Guilford (james.guilford@intel.com)
19 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
20 *             Tadeusz Struk (tadeusz.struk@intel.com)
21 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
22 *    Copyright (c) 2010, Intel Corporation.
23 *
24 * Ported x86_64 version to x86:
25 *    Author: Mathias Krause <minipli@googlemail.com>
26 */
27
28#include <linux/linkage.h>
29#include <asm/inst.h>
30#include <asm/frame.h>
31#include <asm/nospec-branch.h>
32
33/*
34 * The following macros are used to move an (un)aligned 16 byte value to/from
35 * an XMM register.  This can be done for either FP or integer values: for FP use
36 * movaps (move aligned packed single), for integer use movdqa (move double quad
37 * aligned).  It doesn't make a performance difference which instruction is used
38 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
39 * shorter, so that is the one we'll use for now (same for the unaligned form).
40 */
41#define MOVADQ	movaps
42#define MOVUDQ	movups
43
44#ifdef __x86_64__
45
46# constants in mergeable sections, linker can reorder and merge
/*
 * Each constant below is 16 bytes, 16-byte aligned, and sits in its own
 * mergeable .rodata.cst16.* section.  Uses visible in the GCM macros:
 *   POLY, TWOONE - PRECOMPUTE uses them to reduce HashKey<<1
 *   SHUF_MASK    - pshufb control that reverses the order of the 16 bytes
 *   ONE          - added with paddd to step the counter block
 */
47.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
48.align 16
49.Lgf128mul_x_ble_mask:
50	.octa 0x00000000000000010000000000000087
51.section	.rodata.cst16.POLY, "aM", @progbits, 16
52.align 16
53POLY:   .octa 0xC2000000000000000000000000000001
54.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
55.align 16
56TWOONE: .octa 0x00000001000000000000000000000001
57
58.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
59.align 16
60SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
61.section	.rodata.cst16.MASK1, "aM", @progbits, 16
62.align 16
63MASK1:      .octa 0x0000000000000000ffffffffffffffff
64.section	.rodata.cst16.MASK2, "aM", @progbits, 16
65.align 16
66MASK2:      .octa 0xffffffffffffffff0000000000000000
67.section	.rodata.cst16.ONE, "aM", @progbits, 16
68.align 16
69ONE:        .octa 0x00000000000000000000000000000001
70.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
71.align 16
/* NOTE(review): the literal below has 31 hex digits, not 32 - confirm the intended value. */
72F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
73.section	.rodata.cst16.dec, "aM", @progbits, 16
74.align 16
75dec:        .octa 0x1
76.section	.rodata.cst16.enc, "aM", @progbits, 16
77.align 16
78enc:        .octa 0x2
79
80# order of these constants should not change.
81# more specifically, ALL_F should follow SHIFT_MASK,
82# and zero should follow ALL_F
/*
 * These three are kept together in plain .rodata (no merge flag) because the
 * code does unaligned 16-byte loads at byte offsets into the table, e.g.
 * SHIFT_MASK+16-n, ALL_F+16-n and ALL_F-SHIFT_MASK(reg).  A load that starts
 * part way into one constant therefore runs on into the next one, so the
 * layout SHIFT_MASK, ALL_F, zero is part of the contract.
 */
83.section	.rodata, "a", @progbits
84.align 16
85SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
86ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
87            .octa 0x00000000000000000000000000000000
88
89.text
90
91
/*
 * Bytes pushed by FUNC_SAVE (three 8-byte registers).  The stack-argument
 * macros arg7..arg11 add this plus 8 more bytes to step over the saved
 * registers and one further 8-byte slot (presumably the return address).
 */
92#define	STACK_OFFSET    8*3
93
/*
 * Byte offsets of the fields the GCM macros access relative to %arg2
 * (the comments in this file call that structure gcm_context_data).
 * AadLen and InLen are two 8-byte fields sharing one 16-byte slot.
 */
94#define AadHash 16*0
95#define AadLen 16*1
96#define InLen (16*1)+8
97#define PBlockEncKey 16*2
98#define OrigIV 16*3
99#define CurCount 16*4
100#define PBlockLen 16*5
101#define	HashKey		16*6	// store HashKey <<1 mod poly here
102#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
103#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
104#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
105#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
106				// bits of  HashKey <<1 mod poly here
107				//(for Karatsuba purposes)
108#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
109				// bits of  HashKey^2 <<1 mod poly here
110				// (for Karatsuba purposes)
111#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
112				// bits of  HashKey^3 <<1 mod poly here
113				// (for Karatsuba purposes)
114#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
115				// bits of  HashKey^4 <<1 mod poly here
116				// (for Karatsuba purposes)
117
/*
 * arg1..arg6 are bare register names (written %arg1 etc. at the use site),
 * in System V AMD64 integer-argument order.  arg7..arg11 are complete memory
 * operands and are only valid while exactly STACK_OFFSET bytes have been
 * pushed since function entry.
 */
118#define arg1 rdi
119#define arg2 rsi
120#define arg3 rdx
121#define arg4 rcx
122#define arg5 r8
123#define arg6 r9
124#define arg7 STACK_OFFSET+8(%rsp)
125#define arg8 STACK_OFFSET+16(%rsp)
126#define arg9 STACK_OFFSET+24(%rsp)
127#define arg10 STACK_OFFSET+32(%rsp)
128#define arg11 STACK_OFFSET+40(%rsp)
/*
 * Field at byte offset 480 of the structure addressed by %arg1, read as a
 * 32-bit value.  Presumably the AES key length in bytes (16/24/32), going by
 * the shift comments at its use sites - TODO confirm against the C struct.
 */
129#define keysize 2*15*16(%arg1)
130#endif
131
132
/*
 * Symbolic register names.  Their users are presumably the plain AES
 * routines later in this file (not the GCM macros, which name registers
 * directly) - verify before changing an assignment.
 * Note that BSWAP_MASK and GF128MUL_MASK both alias %xmm10.
 */
133#define STATE1	%xmm0
134#define STATE2	%xmm4
135#define STATE3	%xmm5
136#define STATE4	%xmm6
137#define STATE	STATE1
138#define IN1	%xmm1
139#define IN2	%xmm7
140#define IN3	%xmm8
141#define IN4	%xmm9
142#define IN	IN1
143#define KEY	%xmm2
144#define IV	%xmm3
145
146#define BSWAP_MASK %xmm10
147#define CTR	%xmm11
148#define INC	%xmm12
149
150#define GF128MUL_MASK %xmm10
151
/*
 * General-purpose aliases.  The 64-bit set uses the argument registers
 * directly; the 32-bit set has fewer registers, so OUTP/UKEYP share AREG
 * and there are no T2/TCTR_LOW aliases.
 */
152#ifdef __x86_64__
153#define AREG	%rax
154#define KEYP	%rdi
155#define OUTP	%rsi
156#define UKEYP	OUTP
157#define INP	%rdx
158#define LEN	%rcx
159#define IVP	%r8
160#define KLEN	%r9d
161#define T1	%r10
162#define TKEYP	T1
163#define T2	%r11
164#define TCTR_LOW T2
165#else
166#define AREG	%eax
167#define KEYP	%edi
168#define OUTP	AREG
169#define UKEYP	OUTP
170#define INP	%edx
171#define LEN	%esi
172#define IVP	%ebp
173#define KLEN	%ebx
174#define T1	%ecx
175#define TKEYP	T1
176#endif
177
/*
 * FUNC_SAVE: save the callee-saved registers the GCM macros use as scratch
 * (%r12, %r13, %r14).  Pushes exactly 3 * 8 bytes, which is what STACK_OFFSET
 * encodes; change both together or arg7..arg11 will read the wrong slots.
 * Must be paired with FUNC_RESTORE.
 */
178.macro FUNC_SAVE
179	push	%r12
180	push	%r13
181	push	%r14
182#
183# states of %xmm registers %xmm6:%xmm15 not saved
184# all %xmm registers are clobbered
185#
186.endm
187
188
/*
 * FUNC_RESTORE: undo FUNC_SAVE by popping %r14, %r13, %r12 in the reverse
 * of the order in which they were pushed.
 */
189.macro FUNC_RESTORE
190	pop	%r14
191	pop	%r13
192	pop	%r12
193.endm
194
195# Precompute hashkeys.
196# Input: Hash subkey.
197# Output: HashKeys stored in gcm_context_data.  Only needs to be called
198# once per key.
199# clobbers r12, and tmp xmm registers.
/*
 * SUBKEY is an operand that yields a POINTER to the 16-byte hash subkey
 * (it is moved into %r12 and then dereferenced).
 * Stores HashKey, HashKey_2..HashKey_4 and the matching *_k Karatsuba
 * helpers (high qword XOR low qword) at their offsets from %arg2.
 * All seven TMP registers are overwritten.
 */
200.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
201	mov	\SUBKEY, %r12
202	movdqu	(%r12), \TMP3
203	movdqa	SHUF_MASK(%rip), \TMP2
204	PSHUFB_XMM \TMP2, \TMP3
205
206	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
207
	/*
	 * 128-bit shift left by one built from two 64-bit shifts: TMP2 holds
	 * the bit shifted out of each qword; the low-qword carry is moved up
	 * into the high qword, and the high-qword carry is kept in TMP1.
	 */
208	movdqa	\TMP3, \TMP2
209	psllq	$1, \TMP3
210	psrlq	$63, \TMP2
211	movdqa	\TMP2, \TMP1
212	pslldq	$8, \TMP2
213	psrldq	$8, \TMP1
214	por	\TMP2, \TMP3
215
216	# reduce HashKey<<1
217
	/*
	 * Branch-free select: the shuffle copies the carry into dwords 0 and 3,
	 * the compare with TWOONE turns TMP2 into all-ones when the carry is 1
	 * (only the two middle dwords otherwise), and the AND with POLY, whose
	 * middle dwords are zero, leaves POLY or zero to XOR into the key.
	 */
218	pshufd	$0x24, \TMP1, \TMP2
219	pcmpeqd TWOONE(%rip), \TMP2
220	pand	POLY(%rip), \TMP2
221	pxor	\TMP2, \TMP3
222	movdqu	\TMP3, HashKey(%arg2)
223
	/* TMP3 keeps HashKey<<1 for the rest of the macro; TMP5 accumulates the powers. */
224	movdqa	   \TMP3, \TMP5
225	pshufd	   $78, \TMP3, \TMP1
226	pxor	   \TMP3, \TMP1
227	movdqu	   \TMP1, HashKey_k(%arg2)
228
229	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
230# TMP5 = HashKey^2<<1 (mod poly)
231	movdqu	   \TMP5, HashKey_2(%arg2)
232# HashKey_2 = HashKey^2<<1 (mod poly)
233	pshufd	   $78, \TMP5, \TMP1
234	pxor	   \TMP5, \TMP1
235	movdqu	   \TMP1, HashKey_2_k(%arg2)
236
237	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
238# TMP5 = HashKey^3<<1 (mod poly)
239	movdqu	   \TMP5, HashKey_3(%arg2)
240	pshufd	   $78, \TMP5, \TMP1
241	pxor	   \TMP5, \TMP1
242	movdqu	   \TMP1, HashKey_3_k(%arg2)
243
244	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
245# TMP5 = HashKey^4<<1 (mod poly)
246	movdqu	   \TMP5, HashKey_4(%arg2)
247	pshufd	   $78, \TMP5, \TMP1
248	pxor	   \TMP5, \TMP1
249	movdqu	   \TMP1, HashKey_4_k(%arg2)
250.endm
251
252# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
253# Clobbers rax, r10-r13 and xmm0-xmm7, %xmm13, %xmm14
/*
 * Iv, SUBKEY and AAD are operands that yield POINTERS (Iv and SUBKEY are
 * moved into a register and dereferenced; AAD is handed to CALC_AAD_HASH,
 * which does the same).  AADLEN is the AAD length in bytes.
 * On exit the context at %arg2 holds: AadLen, InLen = 0, PBlockLen = 0,
 * the first 8 bytes of PBlockEncKey = 0, OrigIV, CurCount (byte-reversed
 * IV), the HashKey table written by PRECOMPUTE, and AadHash written by
 * CALC_AAD_HASH.
 */
254.macro GCM_INIT Iv SUBKEY AAD AADLEN
255	mov \AADLEN, %r11
256	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
257	xor %r11d, %r11d
258	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
259	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
260	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
261	mov \Iv, %rax
262	movdqu (%rax), %xmm0
263	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
264
265	movdqa  SHUF_MASK(%rip), %xmm2
266	PSHUFB_XMM %xmm2, %xmm0
267	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
268
	/*
	 * No comma after the last argument: PRECOMPUTE takes exactly eight
	 * parameters, and a trailing comma is parsed as a ninth (empty)
	 * argument by assemblers that are strict about macro arity.
	 */
269	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
270	movdqu HashKey(%arg2), %xmm13
271
272	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
273	%xmm4, %xmm5, %xmm6
274.endm
275
276# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
277# struct has been initialized by GCM_INIT.
278# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
279# Clobbers rax, r10-r13, and xmm0-xmm15
/*
 * operation is the literal token enc or dec; it selects code at assembly
 * time through .ifc and is pasted into a macro name further down.
 * Registers as used here: %arg1 = AES round keys, %arg2 = GCM context,
 * %arg3 = output pointer, %arg4 = input pointer, %arg5 = byte count
 * (modified), %r11 = running byte offset into input and output.
 * Flow: finish any partial block left in the context, process
 * (whole blocks mod 4) initial blocks, run the 4-blocks-at-a-time loop,
 * then handle a trailing block of fewer than 16 bytes.
 */
280.macro GCM_ENC_DEC operation
281	movdqu AadHash(%arg2), %xmm8
282	movdqu HashKey(%arg2), %xmm13
283	add %arg5, InLen(%arg2)
284
285	xor %r11d, %r11d # initialise the data pointer offset as zero
286	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
287
288	sub %r11, %arg5		# sub partial block data used
289	mov %arg5, %r13		# save the number of bytes
290
291	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
292	mov %r13, %r12
293	# Encrypt/Decrypt first few blocks
294
	/* %r12 = 16 * (number of whole blocks mod 4); dispatch on 0, 1, 2 or 3 */
295	and	$(3<<4), %r12
296	jz	_initial_num_blocks_is_0_\@
297	cmp	$(2<<4), %r12
298	jb	_initial_num_blocks_is_1_\@
299	je	_initial_num_blocks_is_2_\@
300_initial_num_blocks_is_3_\@:
301	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
302%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
303	sub	$48, %r13
304	jmp	_initial_blocks_\@
305_initial_num_blocks_is_2_\@:
306	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
307%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
308	sub	$32, %r13
309	jmp	_initial_blocks_\@
310_initial_num_blocks_is_1_\@:
311	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
312%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
313	sub	$16, %r13
314	jmp	_initial_blocks_\@
315_initial_num_blocks_is_0_\@:
316	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
317%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
318_initial_blocks_\@:
319
320	# Main loop - Encrypt/Decrypt remaining blocks
321
	/*
	 * %r13 is now a multiple of 64.  When it is non-zero,
	 * INITIAL_BLOCKS_ENC_DEC has already produced the first group of four
	 * blocks, so 64 is subtracted before the loop; GHASH_LAST_4 then
	 * consumes the last group of four, whose hash is still pending.
	 *
	 * NOTE(review): the macro name below is formed with the lower-case
	 * operation token while the definition visible in this file is spelled
	 * ..._ENC, so this relies on the assembler matching macro names without
	 * regard to case.  The final argument is always the literal enc, not
	 * the operation parameter - confirm the callee ignores it.
	 */
322	cmp	$0, %r13
323	je	_zero_cipher_left_\@
324	sub	$64, %r13
325	je	_four_cipher_left_\@
326_crypt_by_4_\@:
327	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
328	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
329	%xmm7, %xmm8, enc
330	add	$64, %r11
331	sub	$64, %r13
332	jne	_crypt_by_4_\@
333_four_cipher_left_\@:
334	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
335%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
336_zero_cipher_left_\@:
337	movdqu %xmm8, AadHash(%arg2)
338	movdqu %xmm0, CurCount(%arg2)
339
340	mov	%arg5, %r13
341	and	$15, %r13			# %r13 = arg5 (mod 16)
342	je	_multiple_of_16_bytes_\@
343
	/* Record the tail length so a later call (or GCM_COMPLETE) can finish this block. */
344	mov %r13, PBlockLen(%arg2)
345
346	# Handle the last <16 Byte block separately
347	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
348	movdqu %xmm0, CurCount(%arg2)
349	movdqa SHUF_MASK(%rip), %xmm10
350	PSHUFB_XMM %xmm10, %xmm0
351
352	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
353	movdqu %xmm0, PBlockEncKey(%arg2)
354
	/*
	 * Two ways to fetch the tail without reading past the input: with at
	 * least 16 bytes available in this call, load the 16 bytes that END at
	 * the last input byte and shift them down; otherwise read the bytes
	 * individually through READ_PARTIAL_BLOCK.
	 */
355	cmp	$16, %arg5
356	jge _large_enough_update_\@
357
358	lea (%arg4,%r11,1), %r10
359	mov %r13, %r12
360	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
361	jmp _data_read_\@
362
363_large_enough_update_\@:
	/* temporarily step %r11 back by 16 - r13 so the load ends at the last byte */
364	sub	$16, %r11
365	add	%r13, %r11
366
367	# receive the last <16 Byte block
368	movdqu	(%arg4, %r11, 1), %xmm1
369
370	sub	%r13, %r11
371	add	$16, %r11
372
373	lea	SHIFT_MASK+16(%rip), %r12
374	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
375	# (r13 is the number of bytes in plaintext mod 16)
376	sub	%r13, %r12
377	# get the appropriate shuffle mask
378	movdqu	(%r12), %xmm2
379	# shift right 16-r13 bytes
380	PSHUFB_XMM  %xmm2, %xmm1
381
382_data_read_\@:
	/* %r12 = ALL_F + 16 - r13: a mask with r13 low bytes of 0xff followed by zeros */
383	lea ALL_F+16(%rip), %r12
384	sub %r13, %r12
385
	/* When decrypting, the hash must cover the ciphertext, so keep a copy of the input. */
386.ifc \operation, dec
387	movdqa  %xmm1, %xmm2
388.endif
389	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
390	movdqu	(%r12), %xmm1
391	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
392	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
393.ifc \operation, dec
394	pand    %xmm1, %xmm2
395	movdqa SHUF_MASK(%rip), %xmm10
396	PSHUFB_XMM %xmm10 ,%xmm2
397
398	pxor %xmm2, %xmm8
399.else
400	movdqa SHUF_MASK(%rip), %xmm10
401	PSHUFB_XMM %xmm10,%xmm0
402
403	pxor	%xmm0, %xmm8
404.endif
405
	/*
	 * The tail is only XORed into the hash here; the multiply by HashKey
	 * is deferred (PARTIAL_BLOCK and GCM_COMPLETE perform it when
	 * PBlockLen is non-zero).
	 */
406	movdqu %xmm8, AadHash(%arg2)
407.ifc \operation, enc
408	# GHASH computation for the last <16 byte block
409	movdqa SHUF_MASK(%rip), %xmm10
410	# shuffle xmm0 back to output as ciphertext
411	PSHUFB_XMM %xmm10, %xmm0
412.endif
413
414	# Output %r13 bytes
	/* more than 8 bytes: store the low qword whole, then the rest byte by byte */
415	MOVQ_R64_XMM %xmm0, %rax
416	cmp $8, %r13
417	jle _less_than_8_bytes_left_\@
418	mov %rax, (%arg3 , %r11, 1)
419	add $8, %r11
420	psrldq $8, %xmm0
421	MOVQ_R64_XMM %xmm0, %rax
422	sub $8, %r13
423_less_than_8_bytes_left_\@:
424	mov %al,  (%arg3, %r11, 1)
425	add $1, %r11
426	shr $8, %rax
427	sub $1, %r13
428	jne _less_than_8_bytes_left_\@
429_multiple_of_16_bytes_\@:
430.endm
431
432# GCM_COMPLETE Finishes update of tag of last partial block
433# Output: Authorization Tag (AUTH_TAG)
434# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
/*
 * AUTHTAG yields the destination pointer and AUTHTAGLEN the tag length in
 * bytes.  The tag is E(K, OrigIV) XOR the byte-reversed final GHASH value.
 * NOTE(review): every path except the 16-byte one starts with an 8- or
 * 4-byte store, so a length below 4 would write past the buffer; this
 * assumes callers only request lengths of at least 4 - TODO confirm.
 */
435.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
436	movdqu AadHash(%arg2), %xmm8
437	movdqu HashKey(%arg2), %xmm13
438
439	mov PBlockLen(%arg2), %r12
440
441	cmp $0, %r12
442	je _partial_done\@
443
	/* a partial block was XORed into the hash but not yet multiplied by HashKey */
444	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
445
446_partial_done\@:
447	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
448	shl	$3, %r12		  # convert into number of bits
449	movd	%r12d, %xmm15		  # len(A) in %xmm15
450	mov InLen(%arg2), %r12
451	shl     $3, %r12                  # len(C) in bits (*128)
452	MOVQ_R64_XMM    %r12, %xmm1
453
454	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
455	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
456	pxor	%xmm15, %xmm8
457	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
458	# final GHASH computation
459	movdqa SHUF_MASK(%rip), %xmm10
460	PSHUFB_XMM %xmm10, %xmm8
461
462	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
463	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
464	pxor	%xmm8, %xmm0
	/*
	 * Store the low AUTHTAGLEN bytes of %xmm0: 16 in one go, otherwise an
	 * 8-byte piece (when at least 8 remain), then a 4-byte piece, then the
	 * last 1 to 3 bytes, shifting %xmm0 down after each piece.
	 */
465_return_T_\@:
466	mov	\AUTHTAG, %r10                     # %r10 = authTag
467	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
468	cmp	$16, %r11
469	je	_T_16_\@
470	cmp	$8, %r11
471	jl	_T_4_\@
472_T_8_\@:
473	MOVQ_R64_XMM	%xmm0, %rax
474	mov	%rax, (%r10)
475	add	$8, %r10
476	sub	$8, %r11
477	psrldq	$8, %xmm0
478	cmp	$0, %r11
479	je	_return_T_done_\@
480_T_4_\@:
481	movd	%xmm0, %eax
482	mov	%eax, (%r10)
483	add	$4, %r10
484	sub	$4, %r11
485	psrldq	$4, %xmm0
486	cmp	$0, %r11
487	je	_return_T_done_\@
488_T_123_\@:
489	movd	%xmm0, %eax
490	cmp	$2, %r11
491	jl	_T_1_\@
492	mov	%ax, (%r10)
493	cmp	$2, %r11
494	je	_return_T_done_\@
495	add	$2, %r10
496	sar	$16, %eax
497_T_1_\@:
498	mov	%al, (%r10)
499	jmp	_return_T_done_\@
500_T_16_\@:
501	movdqu	%xmm0, (%r10)
502_return_T_done_\@:
503.endm
504
505#ifdef __x86_64__
506/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
507*
508*
509* Input: A and B (128-bits each, bit-reflected)
510* Output: C = A*B*x mod poly, (i.e. >>1 )
511* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
512* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
513*
* The result replaces GH.  HK is only read.  TMP1..TMP5 are overwritten.
* Method: one Karatsuba 128x128 carry-less multiply (three PCLMULQDQ)
* followed by a two-phase shift-and-XOR reduction of the 256-bit product.
514*/
515.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
516	movdqa	  \GH, \TMP1
517	pshufd	  $78, \GH, \TMP2
518	pshufd	  $78, \HK, \TMP3
519	pxor	  \GH, \TMP2            # TMP2 = a1+a0
520	pxor	  \HK, \TMP3            # TMP3 = b1+b0
521	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
522	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
523	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
524	pxor	  \GH, \TMP2
525	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0), the middle term
526	movdqa	  \TMP2, \TMP3
527	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
528	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
529	pxor	  \TMP3, \GH
530	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
531
532        # first phase of the reduction
533
534	movdqa    \GH, \TMP2
535	movdqa    \GH, \TMP3
536	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
537					# in order to perform
538					# independent shifts
539	pslld     $31, \TMP2            # packed left shift <<31
540	pslld     $30, \TMP3            # packed left shift <<30
541	pslld     $25, \TMP4            # packed left shift <<25
542	pxor      \TMP3, \TMP2          # xor the shifted versions
543	pxor      \TMP4, \TMP2
544	movdqa    \TMP2, \TMP5
545	psrldq    $4, \TMP5             # right shift TMP5 1 DW
546	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
547	pxor      \TMP2, \GH
548
549        # second phase of the reduction
550
551	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
552					# in order to perform
553					# independent shifts
554	movdqa    \GH,\TMP3
555	movdqa    \GH,\TMP4
556	psrld     $1,\TMP2              # packed right shift >>1
557	psrld     $2,\TMP3              # packed right shift >>2
558	psrld     $7,\TMP4              # packed right shift >>7
559	pxor      \TMP3,\TMP2		# xor the shifted versions
560	pxor      \TMP4,\TMP2
561	pxor      \TMP5, \TMP2
562	pxor      \TMP2, \GH
563	pxor      \TMP1, \GH            # fold in the high half; result is in GH
564.endm
565
566# Reads DLEN bytes starting at DPTR and stores in XMMDst
567# where 0 < DLEN < 16
568# Clobbers %rax, DLEN and XMM1
/*
 * DPTR and DLEN must be 64-bit registers (DLEN is used as an index and
 * counted down to zero).  No byte at or beyond DPTR+DLEN is read: the first
 * 8 bytes, when present, are loaded as one qword and every remaining byte
 * is loaded individually, highest address first, so that after the shifts
 * the bytes sit in memory order.  The unread upper bytes of XMMDst are zero.
 * XMM1 is only written on the path with more than 8 bytes.
 */
569.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
570        cmp $8, \DLEN
571        jl _read_lt8_\@
572        mov (\DPTR), %rax
573        MOVQ_R64_XMM %rax, \XMMDst
574        sub $8, \DLEN
575        jz _done_read_partial_block_\@
576	xor %eax, %eax
577_read_next_byte_\@:
	/* DLEN counts the bytes still to read beyond the first 8; 7 + DLEN indexes the last of them */
578        shl $8, %rax
579        mov 7(\DPTR, \DLEN, 1), %al
580        dec \DLEN
581        jnz _read_next_byte_\@
582        MOVQ_R64_XMM %rax, \XMM1
583	pslldq $8, \XMM1
584        por \XMM1, \XMMDst
585	jmp _done_read_partial_block_\@
586_read_lt8_\@:
587	xor %eax, %eax
588_read_next_byte_lt8_\@:
	/* fewer than 8 bytes in total: gather them all in rax, last byte first */
589        shl $8, %rax
590        mov -1(\DPTR, \DLEN, 1), %al
591        dec \DLEN
592        jnz _read_next_byte_lt8_\@
593        MOVQ_R64_XMM %rax, \XMMDst
594_done_read_partial_block_\@:
595.endm
596
597# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
598# clobbers r10-11, xmm14
/*
 * HASHKEY is the register holding HashKey<<1 mod poly, AAD yields the data
 * pointer and AADLEN the length in bytes.  Whole 16-byte blocks are folded
 * in one GHASH_MUL at a time, then any tail of fewer than 16 bytes is read
 * with READ_PARTIAL_BLOCK, which leaves its upper bytes zero.
 * The result is left in TMP6 and stored to AadHash(%arg2).
 * Also overwrites TMP1..TMP7 and, through READ_PARTIAL_BLOCK, %rax.
 */
599.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
600	TMP6 TMP7
601	MOVADQ	   SHUF_MASK(%rip), %xmm14
602	mov	   \AAD, %r10		# %r10 = AAD
603	mov	   \AADLEN, %r11		# %r11 = aadLen
604	pxor	   \TMP7, \TMP7
605	pxor	   \TMP6, \TMP6
606
607	cmp	   $16, %r11
608	jl	   _get_AAD_rest\@
609_get_AAD_blocks\@:
610	movdqu	   (%r10), \TMP7
611	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
612	pxor	   \TMP7, \TMP6
613	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
614	add	   $16, %r10
615	sub	   $16, %r11
616	cmp	   $16, %r11
617	jge	   _get_AAD_blocks\@
618
619	movdqu	   \TMP6, \TMP7
620
621	/* read the last <16B of AAD */
622_get_AAD_rest\@:
623	cmp	   $0, %r11
624	je	   _get_AAD_done\@
625
626	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
627	PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
628	pxor	   \TMP6, \TMP7
629	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
630	movdqu \TMP7, \TMP6
631
632_get_AAD_done\@:
633	movdqu \TMP6, AadHash(%arg2)
634.endm
635
636# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
637# between update calls.
638# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
639# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
640# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
/*
 * Does nothing when PBlockLen(%arg2) is zero.  Otherwise it continues the
 * block a previous call left unfinished, using the keystream saved in
 * PBlockEncKey: it consumes up to 16 - PBlockLen input bytes, writes the
 * same number of output bytes, advances DATA_OFFSET by that amount and
 * XORs the bytes into AAD_HASH.  When the block becomes full the hash is
 * multiplied by HashKey and PBlockLen is reset to 0; otherwise PBlockLen
 * grows by PLAIN_CYPH_LEN and the multiply stays pending.
 * operation is the literal token enc or dec.
 * NOTE(review): the 16-byte fast path reads from PLAIN_CYPH_IN without
 * adding DATA_OFFSET while the short path adds it; the two agree only
 * because the caller in this file passes a zero offset - confirm before
 * reusing the macro elsewhere.
 */
641.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
642	AAD_HASH operation
643	mov 	PBlockLen(%arg2), %r13
644	cmp	$0, %r13
645	je	_partial_block_done_\@	# Leave Macro if no partial blocks
646	# Read in input data without over reading
647	cmp	$16, \PLAIN_CYPH_LEN
648	jl	_fewer_than_16_bytes_\@
649	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
650	jmp	_data_read_\@
651
652_fewer_than_16_bytes_\@:
653	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
654	mov	\PLAIN_CYPH_LEN, %r12
655	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
656
657	mov PBlockLen(%arg2), %r13
658
659_data_read_\@:				# Finished reading in data
660
661	movdqu	PBlockEncKey(%arg2), %xmm9
662	movdqu	HashKey(%arg2), %xmm13
663
664	lea	SHIFT_MASK(%rip), %r12
665
666	# adjust the shuffle mask pointer to be able to shift r13 bytes
667	# (r13 is the number of bytes already held in the partial block)
668	add	%r13, %r12
669	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
670	PSHUFB_XMM %xmm2, %xmm9		# shift right r13 bytes
671
672.ifc \operation, dec
	/* keep the ciphertext in xmm3: when decrypting, the hash covers the input */
673	movdqa	%xmm1, %xmm3
674	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
675
676	mov	\PLAIN_CYPH_LEN, %r10
677	add	%r13, %r10
678	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
679	sub	$16, %r10
680	# Determine if partial block is not being filled and
681	# shift mask accordingly
682	jge	_no_extra_mask_1_\@
683	sub	%r10, %r12
684_no_extra_mask_1_\@:
685
	/* %r12 points into SHIFT_MASK; the displacement rebases the same offset onto ALL_F */
686	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
687	# get the appropriate mask to mask out bottom r13 bytes of xmm9
688	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
689
690	pand	%xmm1, %xmm3
691	movdqa	SHUF_MASK(%rip), %xmm10
692	PSHUFB_XMM	%xmm10, %xmm3
693	PSHUFB_XMM	%xmm2, %xmm3
694	pxor	%xmm3, \AAD_HASH
695
696	cmp	$0, %r10
697	jl	_partial_incomplete_1_\@
698
699	# GHASH computation for the last <16 Byte block
700	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
701	xor	%eax, %eax
702
703	mov	%rax, PBlockLen(%arg2)
704	jmp	_dec_done_\@
705_partial_incomplete_1_\@:
706	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
707_dec_done_\@:
708	movdqu	\AAD_HASH, AadHash(%arg2)
709.else
710	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
711
712	mov	\PLAIN_CYPH_LEN, %r10
713	add	%r13, %r10
714	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
715	sub	$16, %r10
716	# Determine if partial block is not being filled and
717	# shift mask accordingly
718	jge	_no_extra_mask_2_\@
719	sub	%r10, %r12
720_no_extra_mask_2_\@:
721
722	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
723	# get the appropriate mask to mask out bottom r13 bytes of xmm9
724	pand	%xmm1, %xmm9
725
726	movdqa	SHUF_MASK(%rip), %xmm1
727	PSHUFB_XMM %xmm1, %xmm9
728	PSHUFB_XMM %xmm2, %xmm9
729	pxor	%xmm9, \AAD_HASH
730
731	cmp	$0, %r10
732	jl	_partial_incomplete_2_\@
733
734	# GHASH computation for the last <16 Byte block
735	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
736	xor	%eax, %eax
737
738	mov	%rax, PBlockLen(%arg2)
739	jmp	_encode_done_\@
740_partial_incomplete_2_\@:
741	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
742_encode_done_\@:
743	movdqu	\AAD_HASH, AadHash(%arg2)
744
745	movdqa	SHUF_MASK(%rip), %xmm10
746	# shuffle xmm9 back to output as ciphertext
747	PSHUFB_XMM	%xmm10, %xmm9
748	PSHUFB_XMM	%xmm2, %xmm9
749.endif
750	# output encrypted Bytes
	/* %r10 < 0 means the block is still not full, so only the new input bytes are written */
751	cmp	$0, %r10
752	jl	_partial_fill_\@
753	mov	%r13, %r12
754	mov	$16, %r13
755	# Set r13 to be the number of bytes to write out
756	sub	%r12, %r13
757	jmp	_count_set_\@
758_partial_fill_\@:
759	mov	\PLAIN_CYPH_LEN, %r13
760_count_set_\@:
761	movdqa	%xmm9, %xmm0
762	MOVQ_R64_XMM	%xmm0, %rax
763	cmp	$8, %r13
764	jle	_less_than_8_bytes_left_\@
765
766	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
767	add	$8, \DATA_OFFSET
768	psrldq	$8, %xmm0
769	MOVQ_R64_XMM	%xmm0, %rax
770	sub	$8, %r13
771_less_than_8_bytes_left_\@:
772	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
773	add	$1, \DATA_OFFSET
774	shr	$8, %rax
775	sub	$1, %r13
776	jne	_less_than_8_bytes_left_\@
777_partial_block_done_\@:
778.endm # PARTIAL_BLOCK
779
780/*
781* if a = number of total plaintext bytes
782* b = floor(a/16)
783* num_initial_blocks = b mod 4
784* encrypt the initial num_initial_blocks blocks and apply ghash on
785* the ciphertext
786* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
787* are clobbered
788* arg1, %arg2, %arg3 are used as a pointer only, not modified
*
* Parameters beyond the scratch registers:
*   i      - number of the xmm register that receives the running hash
*            (5, 6 or 7 for 3, 2 or 1 initial blocks; 8 for none)
*   i_seq  - digits of the xmm registers that hold the initial blocks,
*            iterated with .irpc
*   operation - the literal token enc or dec
* The hash chain always ends in %xmm8.  When %r13 is at least 64 on entry
* to the second half (the caller in this file passes the whole-block byte
* count, still including the initial blocks), the first group of four
* blocks is also encrypted/decrypted and left byte-reversed in XMM1..XMM4
* for the caller to hash, with the running hash folded into XMM1.
* %xmm14 is loaded with SHUF_MASK and %r11 advances by 16 per block written.
789*/
790
791
792.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
793	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
794	MOVADQ		SHUF_MASK(%rip), %xmm14
795
796	movdqu AadHash(%arg2), %xmm\i		    # running hash so far
797
798	# start AES for num_initial_blocks blocks
799
800	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
801
802.if (\i == 5) || (\i == 6) || (\i == 7)
803
804	MOVADQ		ONE(%RIP),\TMP1
805	MOVADQ		0(%arg1),\TMP2
806.irpc index, \i_seq
807	paddd		\TMP1, \XMM0                 # INCR Y0
808.ifc \operation, dec
809        movdqa     \XMM0, %xmm\index
810.else
811	MOVADQ		\XMM0, %xmm\index
812.endif
813	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
814	pxor		\TMP2, %xmm\index
815.endr
	/* %r10 walks the round keys from round 1; %eax counts the AESENC rounds before the last */
816	lea	0x10(%arg1),%r10
817	mov	keysize,%eax
818	shr	$2,%eax				# 128->4, 192->6, 256->8
819	add	$5,%eax			      # 128->9, 192->11, 256->13
820
821aes_loop_initial_\@:
822	MOVADQ	(%r10),\TMP1
823.irpc	index, \i_seq
824	AESENC	\TMP1, %xmm\index
825.endr
826	add	$16,%r10
827	sub	$1,%eax
828	jnz	aes_loop_initial_\@
829
830	MOVADQ	(%r10), \TMP1
831.irpc index, \i_seq
832	AESENCLAST \TMP1, %xmm\index         # Last Round
833.endr
834.irpc index, \i_seq
835	movdqu	   (%arg4 , %r11, 1), \TMP1
836	pxor	   \TMP1, %xmm\index
837	movdqu	   %xmm\index, (%arg3 , %r11, 1)
838	# write back plaintext/ciphertext for num_initial_blocks
839	add	   $16, %r11
840
	/* when decrypting, hash the ciphertext that was read, not the plaintext just written */
841.ifc \operation, dec
842	movdqa     \TMP1, %xmm\index
843.endif
844	PSHUFB_XMM	   %xmm14, %xmm\index
845
846		# prepare plaintext/ciphertext for GHASH computation
847.endr
848.endif
849
850        # apply GHASH on num_initial_blocks blocks
851
	/* each step XORs the running hash into the next block and multiplies by the key in TMP3 */
852.if \i == 5
853        pxor       %xmm5, %xmm6
854	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
855        pxor       %xmm6, %xmm7
856	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
857        pxor       %xmm7, %xmm8
858	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
859.elseif \i == 6
860        pxor       %xmm6, %xmm7
861	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
862        pxor       %xmm7, %xmm8
863	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
864.elseif \i == 7
865        pxor       %xmm7, %xmm8
866	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
867.endif
868	cmp	   $64, %r13
869	jl	_initial_blocks_done\@
870	# no need for precomputed values
871/*
872*
873* Precomputations for HashKey parallel with encryption of first 4 blocks.
874* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
875*/
876	MOVADQ	   ONE(%RIP),\TMP1
877	paddd	   \TMP1, \XMM0              # INCR Y0
878	MOVADQ	   \XMM0, \XMM1
879	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
880
881	paddd	   \TMP1, \XMM0              # INCR Y0
882	MOVADQ	   \XMM0, \XMM2
883	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
884
885	paddd	   \TMP1, \XMM0              # INCR Y0
886	MOVADQ	   \XMM0, \XMM3
887	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
888
889	paddd	   \TMP1, \XMM0              # INCR Y0
890	MOVADQ	   \XMM0, \XMM4
891	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
892
893	MOVADQ	   0(%arg1),\TMP1
894	pxor	   \TMP1, \XMM1
895	pxor	   \TMP1, \XMM2
896	pxor	   \TMP1, \XMM3
897	pxor	   \TMP1, \XMM4
898.irpc index, 1234 # do 4 rounds
899	movaps 0x10*\index(%arg1), \TMP1
900	AESENC	   \TMP1, \XMM1
901	AESENC	   \TMP1, \XMM2
902	AESENC	   \TMP1, \XMM3
903	AESENC	   \TMP1, \XMM4
904.endr
905.irpc index, 56789 # do next 5 rounds
906	movaps 0x10*\index(%arg1), \TMP1
907	AESENC	   \TMP1, \XMM1
908	AESENC	   \TMP1, \XMM2
909	AESENC	   \TMP1, \XMM3
910	AESENC	   \TMP1, \XMM4
911.endr
	/*
	 * Rounds 1-9 above are common to every key size.  The loop below runs
	 * the extra rounds of the larger keys, starting at round key 10.  It
	 * names %xmm1..%xmm4 directly, so XMM1..XMM4 must be those registers
	 * (they are at every call site visible in this file).
	 */
912	lea	   0xa0(%arg1),%r10
913	mov	   keysize,%eax
914	shr	   $2,%eax			# 128->4, 192->6, 256->8
915	sub	   $4,%eax			# 128->0, 192->2, 256->4
916	jz	   aes_loop_pre_done\@
917
918aes_loop_pre_\@:
919	MOVADQ	   (%r10),\TMP2
920.irpc	index, 1234
921	AESENC	   \TMP2, %xmm\index
922.endr
923	add	   $16,%r10
924	sub	   $1,%eax
925	jnz	   aes_loop_pre_\@
926
927aes_loop_pre_done\@:
928	MOVADQ	   (%r10), \TMP2
929	AESENCLAST \TMP2, \XMM1
930	AESENCLAST \TMP2, \XMM2
931	AESENCLAST \TMP2, \XMM3
932	AESENCLAST \TMP2, \XMM4
	/*
	 * XOR the keystream with the input.  For dec the plaintext is stored
	 * at once and the register is reloaded with the ciphertext for
	 * hashing; for enc the four ciphertext blocks are stored together in
	 * the .else branch below and stay in the registers.
	 */
933	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
934	pxor	   \TMP1, \XMM1
935.ifc \operation, dec
936	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
937	movdqa     \TMP1, \XMM1
938.endif
939	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
940	pxor	   \TMP1, \XMM2
941.ifc \operation, dec
942	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
943	movdqa     \TMP1, \XMM2
944.endif
945	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
946	pxor	   \TMP1, \XMM3
947.ifc \operation, dec
948	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
949	movdqa     \TMP1, \XMM3
950.endif
951	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
952	pxor	   \TMP1, \XMM4
953.ifc \operation, dec
954	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
955	movdqa     \TMP1, \XMM4
956.else
957	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
958	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
959	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
960	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
961.endif
962
963	add	   $64, %r11
964	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
965	pxor	   \XMMDst, \XMM1
966# combine GHASHed value with the corresponding ciphertext
967	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
968	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
969	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
970
971_initial_blocks_done\@:
972
973.endm
974
975/*
976* encrypt 4 blocks at a time
977* ghash the 4 previously encrypted ciphertext blocks
978* arg1, %arg3, %arg4 are used as pointers only, not modified
979* %r11 is the data offset value
980*/
981.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
982TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
/*
 * Two independent computations are interleaved instruction by instruction:
 *  - GHASH: the values in XMM1..XMM4 on entry are copied to XMM5..XMM8 and
 *    multiplied (karatsuba, three PCLMULQDQ each) by HashKey_4, HashKey_3,
 *    HashKey_2 and HashKey read through %arg2.  The partial products are
 *    accumulated in TMP4 (high), XMM5 (low) and TMP6 (middle), combined
 *    into TMP1:XMM5 and reduced in two phases.
 *  - Counter mode: XMM0 is incremented four times, the four counter values
 *    are shuffled into XMM1..XMM4, encrypted with the round keys at %arg1
 *    and XORed with 64 bytes of input read from (%arg4,%r11).
 * The four result blocks are stored at (%arg3,%r11).  On exit XMM1..XMM4
 * hold the SHUF_MASK-shuffled result blocks and the reduced GHASH value has
 * been XORed into XMM1.
 * Clobbers %eax, %r10, %xmm15 and the flags in addition to the macro
 * arguments.  The "operation" argument is not referenced in this body.
 * NOTE(review): the extra-round loop names %xmm1..%xmm4 directly instead of
 * using the XMM1..XMM4 arguments, so it presumably relies on callers
 * passing exactly those registers - confirm at the call sites.
 */
983
984	movdqa	  \XMM1, \XMM5
985	movdqa	  \XMM2, \XMM6
986	movdqa	  \XMM3, \XMM7
987	movdqa	  \XMM4, \XMM8
988
989        movdqa    SHUF_MASK(%rip), %xmm15
990        # multiply XMM5 * HashKey_4 using karatsuba
991
992	movdqa	  \XMM5, \TMP4
993	pshufd	  $78, \XMM5, \TMP6		# swap the 64-bit halves
994	pxor	  \XMM5, \TMP6			# TMP6 = a1+a0 in both halves
995	paddd     ONE(%rip), \XMM0		# INCR CNT
996	movdqu	  HashKey_4(%arg2), \TMP5
997	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
998	movdqa    \XMM0, \XMM1
999	paddd     ONE(%rip), \XMM0		# INCR CNT
1000	movdqa    \XMM0, \XMM2
1001	paddd     ONE(%rip), \XMM0		# INCR CNT
1002	movdqa    \XMM0, \XMM3
1003	paddd     ONE(%rip), \XMM0		# INCR CNT
1004	movdqa    \XMM0, \XMM4
1005	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
1006	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1007	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1008	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1009	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1010
/* XOR the four counter blocks with the first round key */
1011	pxor	  (%arg1), \XMM1
1012	pxor	  (%arg1), \XMM2
1013	pxor	  (%arg1), \XMM3
1014	pxor	  (%arg1), \XMM4
1015	movdqu	  HashKey_4_k(%arg2), \TMP5
1016	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
1017	movaps 0x10(%arg1), \TMP1
1018	AESENC	  \TMP1, \XMM1              # Round 1
1019	AESENC	  \TMP1, \XMM2
1020	AESENC	  \TMP1, \XMM3
1021	AESENC	  \TMP1, \XMM4
1022	movaps 0x20(%arg1), \TMP1
1023	AESENC	  \TMP1, \XMM1              # Round 2
1024	AESENC	  \TMP1, \XMM2
1025	AESENC	  \TMP1, \XMM3
1026	AESENC	  \TMP1, \XMM4
/* multiply XMM6 * HashKey_3 using karatsuba */
1027	movdqa	  \XMM6, \TMP1
1028	pshufd	  $78, \XMM6, \TMP2
1029	pxor	  \XMM6, \TMP2
1030	movdqu	  HashKey_3(%arg2), \TMP5
1031	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
1032	movaps 0x30(%arg1), \TMP3
1033	AESENC    \TMP3, \XMM1              # Round 3
1034	AESENC    \TMP3, \XMM2
1035	AESENC    \TMP3, \XMM3
1036	AESENC    \TMP3, \XMM4
1037	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
1038	movaps 0x40(%arg1), \TMP3
1039	AESENC	  \TMP3, \XMM1              # Round 4
1040	AESENC	  \TMP3, \XMM2
1041	AESENC	  \TMP3, \XMM3
1042	AESENC	  \TMP3, \XMM4
1043	movdqu	  HashKey_3_k(%arg2), \TMP5
1044	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1045	movaps 0x50(%arg1), \TMP3
1046	AESENC	  \TMP3, \XMM1              # Round 5
1047	AESENC	  \TMP3, \XMM2
1048	AESENC	  \TMP3, \XMM3
1049	AESENC	  \TMP3, \XMM4
1050	pxor	  \TMP1, \TMP4
1051# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1052	pxor	  \XMM6, \XMM5
1053	pxor	  \TMP2, \TMP6
1054	movdqa	  \XMM7, \TMP1
1055	pshufd	  $78, \XMM7, \TMP2
1056	pxor	  \XMM7, \TMP2
1057	movdqu	  HashKey_2(%arg2), \TMP5
1058
1059        # Multiply XMM7 * HashKey_2 using karatsuba
1060
1061	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
1062	movaps 0x60(%arg1), \TMP3
1063	AESENC	  \TMP3, \XMM1              # Round 6
1064	AESENC	  \TMP3, \XMM2
1065	AESENC	  \TMP3, \XMM3
1066	AESENC	  \TMP3, \XMM4
1067	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
1068	movaps 0x70(%arg1), \TMP3
1069	AESENC	  \TMP3, \XMM1             # Round 7
1070	AESENC	  \TMP3, \XMM2
1071	AESENC	  \TMP3, \XMM3
1072	AESENC	  \TMP3, \XMM4
1073	movdqu	  HashKey_2_k(%arg2), \TMP5
1074	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1075	movaps 0x80(%arg1), \TMP3
1076	AESENC	  \TMP3, \XMM1             # Round 8
1077	AESENC	  \TMP3, \XMM2
1078	AESENC	  \TMP3, \XMM3
1079	AESENC	  \TMP3, \XMM4
1080	pxor	  \TMP1, \TMP4
1081# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1082	pxor	  \XMM7, \XMM5
1083	pxor	  \TMP2, \TMP6
1084
1085        # Multiply XMM8 * HashKey
1086        # XMM8 and TMP5 hold the values for the two operands
1087
1088	movdqa	  \XMM8, \TMP1
1089	pshufd	  $78, \XMM8, \TMP2
1090	pxor	  \XMM8, \TMP2
1091	movdqu	  HashKey(%arg2), \TMP5
1092	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1093	movaps 0x90(%arg1), \TMP3
1094	AESENC	  \TMP3, \XMM1            # Round 9
1095	AESENC	  \TMP3, \XMM2
1096	AESENC	  \TMP3, \XMM3
1097	AESENC	  \TMP3, \XMM4
1098	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
/*
 * %eax becomes the number of additional AESENC rounds needed beyond
 * round 9 (0, 2 or 4); %r10 walks the remaining round keys.
 */
1099	lea	  0xa0(%arg1),%r10
1100	mov	  keysize,%eax
1101	shr	  $2,%eax			# 128->4, 192->6, 256->8
1102	sub	  $4,%eax			# 128->0, 192->2, 256->4
1103	jz	  aes_loop_par_enc_done\@
1104
1105aes_loop_par_enc\@:
1106	MOVADQ	  (%r10),\TMP3
1107.irpc	index, 1234
1108	AESENC	  \TMP3, %xmm\index
1109.endr
1110	add	  $16,%r10
1111	sub	  $1,%eax
1112	jnz	  aes_loop_par_enc\@
1113
1114aes_loop_par_enc_done\@:
1115	MOVADQ	  (%r10), \TMP3
1116	AESENCLAST \TMP3, \XMM1           # last round
1117	AESENCLAST \TMP3, \XMM2
1118	AESENCLAST \TMP3, \XMM3
1119	AESENCLAST \TMP3, \XMM4
1120	movdqu    HashKey_k(%arg2), \TMP5
1121	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1122	movdqu	  (%arg4,%r11,1), \TMP3
1123	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1124	movdqu	  16(%arg4,%r11,1), \TMP3
1125	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1126	movdqu	  32(%arg4,%r11,1), \TMP3
1127	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1128	movdqu	  48(%arg4,%r11,1), \TMP3
1129	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1130        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1131        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1132        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1133        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1134	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1135	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1136	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1137	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1138
/* combine the accumulated high (TMP1), low (XMM5) and middle (TMP2) parts */
1139	pxor	  \TMP4, \TMP1
1140	pxor	  \XMM8, \XMM5
1141	pxor	  \TMP6, \TMP2
1142	pxor	  \TMP1, \TMP2
1143	pxor	  \XMM5, \TMP2
1144	movdqa	  \TMP2, \TMP3
1145	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1146	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1147	pxor	  \TMP3, \XMM5
1148	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1149
1150        # first phase of reduction
1151
1152	movdqa    \XMM5, \TMP2
1153	movdqa    \XMM5, \TMP3
1154	movdqa    \XMM5, \TMP4
1155# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1156	pslld     $31, \TMP2                   # packed left shift << 31
1157	pslld     $30, \TMP3                   # packed left shift << 30
1158	pslld     $25, \TMP4                   # packed left shift << 25
1159	pxor      \TMP3, \TMP2	               # xor the shifted versions
1160	pxor      \TMP4, \TMP2
1161	movdqa    \TMP2, \TMP5
1162	psrldq    $4, \TMP5                    # right shift TMP5 1 DW
1163	pslldq    $12, \TMP2                   # left shift TMP2 3 DWs
1164	pxor      \TMP2, \XMM5
1165
1166        # second phase of reduction
1167
1168	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1169	movdqa    \XMM5,\TMP3
1170	movdqa    \XMM5,\TMP4
1171	psrld     $1, \TMP2                    # packed right shift >> 1
1172	psrld     $2, \TMP3                    # packed right shift >> 2
1173	psrld     $7, \TMP4                    # packed right shift >> 7
1174	pxor      \TMP3,\TMP2		       # xor the shifted versions
1175	pxor      \TMP4,\TMP2
1176	pxor      \TMP5, \TMP2
1177	pxor      \TMP2, \XMM5
1178	pxor      \TMP1, \XMM5                 # result is in XMM5
1179
1180	pxor	  \XMM5, \XMM1		# fold the GHASH result into the first block
1181.endm
1182
1183/*
1184* decrypt 4 blocks at a time
1185* ghash the 4 previously decrypted ciphertext blocks
1186* %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
1187* %r11 is the data offset value
*
* The GHASH of XMM1..XMM4 (copied to XMM5..XMM8) is interleaved with the
* encryption of four fresh counter blocks derived from XMM0.  The keystream
* is XORed with 64 bytes read from (%arg4,%r11) and the plaintext is stored
* at (%arg3,%r11).  XMM1..XMM4 are then reloaded with the input ciphertext
* blocks (shuffled with SHUF_MASK), and the reduced GHASH value is XORed
* into XMM1.
* Clobbers %eax, %r10, %xmm15 and the flags in addition to the macro
* arguments.  The "operation" argument is not referenced in this body.
* NOTE(review): the extra-round loop names %xmm1..%xmm4 directly instead of
* using the XMM1..XMM4 arguments, so it presumably relies on callers
* passing exactly those registers - confirm at the call sites.
1188*/
1189.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
1190TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1191
1192	movdqa	  \XMM1, \XMM5
1193	movdqa	  \XMM2, \XMM6
1194	movdqa	  \XMM3, \XMM7
1195	movdqa	  \XMM4, \XMM8
1196
1197        movdqa    SHUF_MASK(%rip), %xmm15
1198        # multiply XMM5 * HashKey_4 using karatsuba
1199
1200	movdqa	  \XMM5, \TMP4
1201	pshufd	  $78, \XMM5, \TMP6		# swap the 64-bit halves
1202	pxor	  \XMM5, \TMP6			# TMP6 = a1+a0 in both halves
1203	paddd     ONE(%rip), \XMM0		# INCR CNT
1204	movdqu	  HashKey_4(%arg2), \TMP5
1205	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1206	movdqa    \XMM0, \XMM1
1207	paddd     ONE(%rip), \XMM0		# INCR CNT
1208	movdqa    \XMM0, \XMM2
1209	paddd     ONE(%rip), \XMM0		# INCR CNT
1210	movdqa    \XMM0, \XMM3
1211	paddd     ONE(%rip), \XMM0		# INCR CNT
1212	movdqa    \XMM0, \XMM4
1213	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
1214	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1215	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1216	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1217	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1218
/* XOR the four counter blocks with the first round key */
1219	pxor	  (%arg1), \XMM1
1220	pxor	  (%arg1), \XMM2
1221	pxor	  (%arg1), \XMM3
1222	pxor	  (%arg1), \XMM4
1223	movdqu	  HashKey_4_k(%arg2), \TMP5
1224	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
1225	movaps 0x10(%arg1), \TMP1
1226	AESENC	  \TMP1, \XMM1              # Round 1
1227	AESENC	  \TMP1, \XMM2
1228	AESENC	  \TMP1, \XMM3
1229	AESENC	  \TMP1, \XMM4
1230	movaps 0x20(%arg1), \TMP1
1231	AESENC	  \TMP1, \XMM1              # Round 2
1232	AESENC	  \TMP1, \XMM2
1233	AESENC	  \TMP1, \XMM3
1234	AESENC	  \TMP1, \XMM4
/* multiply XMM6 * HashKey_3 using karatsuba */
1235	movdqa	  \XMM6, \TMP1
1236	pshufd	  $78, \XMM6, \TMP2
1237	pxor	  \XMM6, \TMP2
1238	movdqu	  HashKey_3(%arg2), \TMP5
1239	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
1240	movaps 0x30(%arg1), \TMP3
1241	AESENC    \TMP3, \XMM1              # Round 3
1242	AESENC    \TMP3, \XMM2
1243	AESENC    \TMP3, \XMM3
1244	AESENC    \TMP3, \XMM4
1245	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
1246	movaps 0x40(%arg1), \TMP3
1247	AESENC	  \TMP3, \XMM1              # Round 4
1248	AESENC	  \TMP3, \XMM2
1249	AESENC	  \TMP3, \XMM3
1250	AESENC	  \TMP3, \XMM4
1251	movdqu	  HashKey_3_k(%arg2), \TMP5
1252	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1253	movaps 0x50(%arg1), \TMP3
1254	AESENC	  \TMP3, \XMM1              # Round 5
1255	AESENC	  \TMP3, \XMM2
1256	AESENC	  \TMP3, \XMM3
1257	AESENC	  \TMP3, \XMM4
1258	pxor	  \TMP1, \TMP4
1259# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1260	pxor	  \XMM6, \XMM5
1261	pxor	  \TMP2, \TMP6
1262	movdqa	  \XMM7, \TMP1
1263	pshufd	  $78, \XMM7, \TMP2
1264	pxor	  \XMM7, \TMP2
1265	movdqu	  HashKey_2(%arg2), \TMP5
1266
1267        # Multiply XMM7 * HashKey_2 using karatsuba
1268
1269	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
1270	movaps 0x60(%arg1), \TMP3
1271	AESENC	  \TMP3, \XMM1              # Round 6
1272	AESENC	  \TMP3, \XMM2
1273	AESENC	  \TMP3, \XMM3
1274	AESENC	  \TMP3, \XMM4
1275	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
1276	movaps 0x70(%arg1), \TMP3
1277	AESENC	  \TMP3, \XMM1             # Round 7
1278	AESENC	  \TMP3, \XMM2
1279	AESENC	  \TMP3, \XMM3
1280	AESENC	  \TMP3, \XMM4
1281	movdqu	  HashKey_2_k(%arg2), \TMP5
1282	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1283	movaps 0x80(%arg1), \TMP3
1284	AESENC	  \TMP3, \XMM1             # Round 8
1285	AESENC	  \TMP3, \XMM2
1286	AESENC	  \TMP3, \XMM3
1287	AESENC	  \TMP3, \XMM4
1288	pxor	  \TMP1, \TMP4
1289# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1290	pxor	  \XMM7, \XMM5
1291	pxor	  \TMP2, \TMP6
1292
1293        # Multiply XMM8 * HashKey
1294        # XMM8 and TMP5 hold the values for the two operands
1295
1296	movdqa	  \XMM8, \TMP1
1297	pshufd	  $78, \XMM8, \TMP2
1298	pxor	  \XMM8, \TMP2
1299	movdqu	  HashKey(%arg2), \TMP5
1300	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1301	movaps 0x90(%arg1), \TMP3
1302	AESENC	  \TMP3, \XMM1            # Round 9
1303	AESENC	  \TMP3, \XMM2
1304	AESENC	  \TMP3, \XMM3
1305	AESENC	  \TMP3, \XMM4
1306	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
/*
 * %eax becomes the number of additional AESENC rounds needed beyond
 * round 9 (0, 2 or 4); %r10 walks the remaining round keys.
 */
1307	lea	  0xa0(%arg1),%r10
1308	mov	  keysize,%eax
1309	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1310	sub	  $4,%eax			# 128->0, 192->2, 256->4
1311	jz	  aes_loop_par_dec_done\@
1312
1313aes_loop_par_dec\@:
1314	MOVADQ	  (%r10),\TMP3
1315.irpc	index, 1234
1316	AESENC	  \TMP3, %xmm\index
1317.endr
1318	add	  $16,%r10
1319	sub	  $1,%eax
1320	jnz	  aes_loop_par_dec\@
1321
1322aes_loop_par_dec_done\@:
1323	MOVADQ	  (%r10), \TMP3
1324	AESENCLAST \TMP3, \XMM1           # last round
1325	AESENCLAST \TMP3, \XMM2
1326	AESENCLAST \TMP3, \XMM3
1327	AESENCLAST \TMP3, \XMM4
1328	movdqu    HashKey_k(%arg2), \TMP5
1329	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
/*
 * For each of the four blocks: XOR the keystream with the input, store the
 * plaintext, then keep the input ciphertext block in XMMn for GHASH.
 */
1330	movdqu	  (%arg4,%r11,1), \TMP3
1331	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1332	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1333	movdqa    \TMP3, \XMM1
1334	movdqu	  16(%arg4,%r11,1), \TMP3
1335	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1336	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1337	movdqa    \TMP3, \XMM2
1338	movdqu	  32(%arg4,%r11,1), \TMP3
1339	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1340	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1341	movdqa    \TMP3, \XMM3
1342	movdqu	  48(%arg4,%r11,1), \TMP3
1343	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1344	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1345	movdqa    \TMP3, \XMM4
1346	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1347	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
1348	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
1349	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
1350
/* combine the accumulated high (TMP1), low (XMM5) and middle (TMP2) parts */
1351	pxor	  \TMP4, \TMP1
1352	pxor	  \XMM8, \XMM5
1353	pxor	  \TMP6, \TMP2
1354	pxor	  \TMP1, \TMP2
1355	pxor	  \XMM5, \TMP2
1356	movdqa	  \TMP2, \TMP3
1357	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1358	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1359	pxor	  \TMP3, \XMM5
1360	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1361
1362        # first phase of reduction
1363
1364	movdqa    \XMM5, \TMP2
1365	movdqa    \XMM5, \TMP3
1366	movdqa    \XMM5, \TMP4
1367# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1368	pslld     $31, \TMP2                   # packed left shift << 31
1369	pslld     $30, \TMP3                   # packed left shift << 30
1370	pslld     $25, \TMP4                   # packed left shift << 25
1371	pxor      \TMP3, \TMP2	               # xor the shifted versions
1372	pxor      \TMP4, \TMP2
1373	movdqa    \TMP2, \TMP5
1374	psrldq    $4, \TMP5                    # right shift TMP5 1 DW
1375	pslldq    $12, \TMP2                   # left shift TMP2 3 DWs
1376	pxor      \TMP2, \XMM5
1377
1378        # second phase of reduction
1379
1380	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1381	movdqa    \XMM5,\TMP3
1382	movdqa    \XMM5,\TMP4
1383	psrld     $1, \TMP2                    # packed right shift >> 1
1384	psrld     $2, \TMP3                    # packed right shift >> 2
1385	psrld     $7, \TMP4                    # packed right shift >> 7
1386	pxor      \TMP3,\TMP2		       # xor the shifted versions
1387	pxor      \TMP4,\TMP2
1388	pxor      \TMP5, \TMP2
1389	pxor      \TMP2, \XMM5
1390	pxor      \TMP1, \XMM5                 # result is in XMM5
1391
1392	pxor	  \XMM5, \XMM1		# fold the GHASH result into the first block
1393.endm
1394
1395/* GHASH the last 4 ciphertext blocks. */
/*
 * XMM1..XMM4 are multiplied (karatsuba) by HashKey_4, HashKey_3, HashKey_2
 * and HashKey read through %arg2; the products are summed and reduced.
 * The reduced result is left in XMMDst.  TMP1..TMP7 are scratch and
 * XMM1..XMM4 are overwritten.
 */
1396.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1397TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1398
1399        # Multiply XMM1 * HashKey_4 (using Karatsuba)
1400
1401	movdqa	  \XMM1, \TMP6
1402	pshufd	  $78, \XMM1, \TMP2		# swap the 64-bit halves
1403	pxor	  \XMM1, \TMP2			# TMP2 = a1+a0 in both halves
1404	movdqu	  HashKey_4(%arg2), \TMP5
1405	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1406	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1407	movdqu	  HashKey_4_k(%arg2), \TMP4
1408	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1409	movdqa	  \XMM1, \XMMDst
1410	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1411
1412        # Multiply XMM2 * HashKey_3 (using Karatsuba)
1413
1414	movdqa	  \XMM2, \TMP1
1415	pshufd	  $78, \XMM2, \TMP2
1416	pxor	  \XMM2, \TMP2
1417	movdqu	  HashKey_3(%arg2), \TMP5
1418	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1419	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1420	movdqu	  HashKey_3_k(%arg2), \TMP4
1421	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1422	pxor	  \TMP1, \TMP6
1423	pxor	  \XMM2, \XMMDst
1424	pxor	  \TMP2, \XMM1
1425# results accumulated in TMP6, XMMDst, XMM1
1426
1427        # Multiply XMM3 * HashKey_2 (using Karatsuba)
1428
1429	movdqa	  \XMM3, \TMP1
1430	pshufd	  $78, \XMM3, \TMP2
1431	pxor	  \XMM3, \TMP2
1432	movdqu	  HashKey_2(%arg2), \TMP5
1433	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1434	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1435	movdqu	  HashKey_2_k(%arg2), \TMP4
1436	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1437	pxor	  \TMP1, \TMP6
1438	pxor	  \XMM3, \XMMDst
1439	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1440
1441        # Multiply XMM4 * HashKey (using Karatsuba)
1442	movdqa	  \XMM4, \TMP1
1443	pshufd	  $78, \XMM4, \TMP2
1444	pxor	  \XMM4, \TMP2
1445	movdqu	  HashKey(%arg2), \TMP5
1446	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1447	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1448	movdqu	  HashKey_k(%arg2), \TMP4
1449	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1450	pxor	  \TMP1, \TMP6
1451	pxor	  \XMM4, \XMMDst
1452	pxor	  \XMM1, \TMP2
1453	pxor	  \TMP6, \TMP2
1454	pxor	  \XMMDst, \TMP2
1455	# middle section of the temp results combined as in karatsuba algorithm
1456	movdqa	  \TMP2, \TMP4
1457	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1458	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1459	pxor	  \TMP4, \XMMDst
1460	pxor	  \TMP2, \TMP6
1461# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1462	# first phase of the reduction
1463	movdqa    \XMMDst, \TMP2
1464	movdqa    \XMMDst, \TMP3
1465	movdqa    \XMMDst, \TMP4
1466# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1467	pslld     $31, \TMP2                # packed left shifting << 31
1468	pslld     $30, \TMP3                # packed left shifting << 30
1469	pslld     $25, \TMP4                # packed left shifting << 25
1470	pxor      \TMP3, \TMP2              # xor the shifted versions
1471	pxor      \TMP4, \TMP2
1472	movdqa    \TMP2, \TMP7
1473	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1474	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1475	pxor      \TMP2, \XMMDst
1476
1477        # second phase of the reduction
1478	movdqa    \XMMDst, \TMP2
1479	# make 3 copies of XMMDst for doing 3 shift operations
1480	movdqa    \XMMDst, \TMP3
1481	movdqa    \XMMDst, \TMP4
1482	psrld     $1, \TMP2                 # packed right shift >> 1
1483	psrld     $2, \TMP3                 # packed right shift >> 2
1484	psrld     $7, \TMP4                 # packed right shift >> 7
1485	pxor      \TMP3, \TMP2              # xor the shifted versions
1486	pxor      \TMP4, \TMP2
1487	pxor      \TMP7, \TMP2
1488	pxor      \TMP2, \XMMDst
1489	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1490.endm
1491
1492
1493/* Encryption of a single block
1494* uses eax & r10
*
* XMM0 holds the block on entry and the encrypted block on exit.
* TMP1 is a scratch register used to load each round key.
* The round keys are read through %arg1; the number of rounds is derived
* from keysize.
1495*/
1496
1497.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1498
1499	pxor		(%arg1), \XMM0		# XOR with the first round key
1500	mov		keysize,%eax
1501	shr		$2,%eax			# 128->4, 192->6, 256->8
1502	add		$5,%eax			# 128->9, 192->11, 256->13
1503	lea		16(%arg1), %r10	  # get first expanded key address
1504
/* %eax full AESENC rounds, one round key (16 bytes) per iteration */
1505_esb_loop_\@:
1506	MOVADQ		(%r10),\TMP1
1507	AESENC		\TMP1,\XMM0
1508	add		$16,%r10
1509	sub		$1,%eax
1510	jnz		_esb_loop_\@
1511
/* final round uses the round key that follows the last one consumed above */
1512	MOVADQ		(%r10),\TMP1
1513	AESENCLAST	\TMP1,\XMM0
1514.endm
1515/*****************************************************************************
1516* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1517*                   struct gcm_context_data *data
1518*                                      // Context data
1519*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1520*                   const u8 *in,      // Ciphertext input
1521*                   u64 plaintext_len, // Length of data in bytes for decryption.
1522*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1523*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1524*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1525*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1526*                   const u8 *aad,     // Additional Authentication Data (AAD)
1527*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1528*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1529*                                      // given authentication tag and only return the plaintext if they match.
1530*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1531*                                      // (most likely), 12 or 8.
1532*
1533* Assumptions:
1534*
1535* keys:
1536*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1537*       set of 11 keys in the data structure void *aes_ctx
1538*
1539* iv:
1540*       0                   1                   2                   3
1541*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1542*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543*       |                             Salt  (From the SA)               |
1544*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545*       |                     Initialization Vector                     |
1546*       |         (This is the sequence number from IPSec header)       |
1547*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1548*       |                              0x1                              |
1549*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1550*
1551*
1552*
1553* AAD:
1554*       AAD padded to 128 bits with 0
1555*       for example, assume AAD is a u32 vector
1556*
1557*       if AAD is 8 bytes:
1558*       AAD[3] = {A0, A1};
1559*       padded AAD in xmm register = {A1 A0 0 0}
1560*
1561*       0                   1                   2                   3
1562*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1563*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1564*       |                               SPI (A1)                        |
1565*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1566*       |                     32-bit Sequence Number (A0)               |
1567*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1568*       |                              0x0                              |
1569*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570*
1571*                                       AAD Format with 32-bit Sequence Number
1572*
1573*       if AAD is 12 bytes:
1574*       AAD[3] = {A0, A1, A2};
1575*       padded AAD in xmm register = {A2 A1 A0 0}
1576*
1577*       0                   1                   2                   3
1578*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1579*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582*       |                               SPI (A2)                        |
1583*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584*       |                 64-bit Extended Sequence Number {A1,A0}       |
1585*       |                                                               |
1586*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1587*       |                              0x0                              |
1588*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1589*
1590*                        AAD Format with 64-bit Extended Sequence Number
1591*
1592* poly = x^128 + x^127 + x^126 + x^121 + 1
1593*
1594*****************************************************************************/
1595ENTRY(aesni_gcm_dec)
/*
 * One-shot GCM decryption: initialise the context, process the data with
 * the "dec" variant and then complete the tag.  FUNC_SAVE, GCM_INIT,
 * GCM_ENC_DEC, GCM_COMPLETE and FUNC_RESTORE are macros defined outside
 * this part of the file.
 */
1596	FUNC_SAVE
1597
/* presumably iv, hash_subkey, aad, aad_len (parameters 6-9 of the prototype) */
1598	GCM_INIT %arg6, arg7, arg8, arg9
1599	GCM_ENC_DEC dec
/* presumably auth_tag, auth_tag_len (parameters 10-11 of the prototype) */
1600	GCM_COMPLETE arg10, arg11
1601	FUNC_RESTORE
1602	ret
1603ENDPROC(aesni_gcm_dec)
1604
1605
1606/*****************************************************************************
1607* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1608*                    struct gcm_context_data *data
1609*                                        // Context data
1610*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1611*                    const u8 *in,       // Plaintext input
1612*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1613*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1614*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1615*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1616*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1617*                    const u8 *aad,      // Additional Authentication Data (AAD)
1618*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1619*                    u8 *auth_tag,       // Authenticated Tag output.
1620*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1621*                                        // 12 or 8.
1622*
1623* Assumptions:
1624*
1625* keys:
1626*       keys are pre-expanded and aligned to 16 bytes. we are using the
1627*       first set of 11 keys in the data structure void *aes_ctx
1628*
1629*
1630* iv:
1631*       0                   1                   2                   3
1632*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1633*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634*       |                             Salt  (From the SA)               |
1635*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636*       |                     Initialization Vector                     |
1637*       |         (This is the sequence number from IPSec header)       |
1638*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1639*       |                              0x1                              |
1640*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1641*
1642*
1643*
1644* AAD:
1645*       AAD padded to 128 bits with 0
1646*       for example, assume AAD is a u32 vector
1647*
1648*       if AAD is 8 bytes:
1649*       AAD[3] = {A0, A1};
1650*       padded AAD in xmm register = {A1 A0 0 0}
1651*
1652*       0                   1                   2                   3
1653*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1654*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1655*       |                               SPI (A1)                        |
1656*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1657*       |                     32-bit Sequence Number (A0)               |
1658*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1659*       |                              0x0                              |
1660*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661*
1662*                                 AAD Format with 32-bit Sequence Number
1663*
1664*       if AAD is 12 bytes:
1665*       AAD[3] = {A0, A1, A2};
1666*       padded AAD in xmm register = {A2 A1 A0 0}
1667*
1668*       0                   1                   2                   3
1669*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1670*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671*       |                               SPI (A2)                        |
1672*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673*       |                 64-bit Extended Sequence Number {A1,A0}       |
1674*       |                                                               |
1675*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1676*       |                              0x0                              |
1677*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1678*
1679*                         AAD Format with 64-bit Extended Sequence Number
1680*
1681* poly = x^128 + x^127 + x^126 + x^121 + 1
1682***************************************************************************/
1683ENTRY(aesni_gcm_enc)
/*
 * One-shot GCM encryption: initialise the context, process the data with
 * the "enc" variant and then complete the tag.  FUNC_SAVE, GCM_INIT,
 * GCM_ENC_DEC, GCM_COMPLETE and FUNC_RESTORE are macros defined outside
 * this part of the file.
 */
1684	FUNC_SAVE
1685
/* presumably iv, hash_subkey, aad, aad_len (parameters 6-9 of the prototype) */
1686	GCM_INIT %arg6, arg7, arg8, arg9
1687	GCM_ENC_DEC enc
1688
/* presumably auth_tag, auth_tag_len (parameters 10-11 of the prototype) */
1689	GCM_COMPLETE arg10, arg11
1690	FUNC_RESTORE
1691	ret
1692ENDPROC(aesni_gcm_enc)
1693
1694/*****************************************************************************
1695* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1696*                     struct gcm_context_data *data,
1697*                                         // context data
1698*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1699*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1700*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1701*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1702*                     const u8 *aad,      // Additional Authentication Data (AAD)
1703*                     u64 aad_len)        // Length of AAD in bytes.
*
* Runs only the GCM_INIT step; the data is processed afterwards by the
* separate update entry points and the tag is produced by the finalize
* entry point.
1704*/
1705ENTRY(aesni_gcm_init)
1706	FUNC_SAVE
/* iv, hash_subkey, aad, aad_len: parameters 3-6 of the prototype above */
1707	GCM_INIT %arg3, %arg4,%arg5, %arg6
1708	FUNC_RESTORE
1709	ret
1710ENDPROC(aesni_gcm_init)
1711
1712/*****************************************************************************
1713* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1714*                    struct gcm_context_data *data,
1715*                                        // context data
1716*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1717*                    const u8 *in,       // Plaintext input
1718*                    u64 plaintext_len); // Length of data in bytes for encryption.
*
* Runs only the GCM_ENC_DEC step in its "enc" variant; no GCM_INIT or
* GCM_COMPLETE is performed here.
1719*/
1720ENTRY(aesni_gcm_enc_update)
1721	FUNC_SAVE
1722	GCM_ENC_DEC enc
1723	FUNC_RESTORE
1724	ret
1725ENDPROC(aesni_gcm_enc_update)
1726
1727/*****************************************************************************
1728* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1729*                    struct gcm_context_data *data,
1730*                                        // context data
1731*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1732*                    const u8 *in,       // Ciphertext input
1733*                    u64 plaintext_len); // Length of data in bytes for decryption.
*
* Runs only the GCM_ENC_DEC step in its "dec" variant; no GCM_INIT or
* GCM_COMPLETE is performed here.
1734*/
1735ENTRY(aesni_gcm_dec_update)
1736	FUNC_SAVE
1737	GCM_ENC_DEC dec
1738	FUNC_RESTORE
1739	ret
1740ENDPROC(aesni_gcm_dec_update)
1741
1742/*****************************************************************************
1743* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1744*                    struct gcm_context_data *data,
1745*                                        // context data
1746*                    u8 *auth_tag,       // Authenticated Tag output.
1747*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1748*                                        // 12 or 8.
*
* Runs only the GCM_COMPLETE step.
1749*/
1750ENTRY(aesni_gcm_finalize)
1751	FUNC_SAVE
/* auth_tag, auth_tag_len: parameters 3-4 of the prototype above */
1752	GCM_COMPLETE %arg3 %arg4
1753	FUNC_RESTORE
1754	ret
1755ENDPROC(aesni_gcm_finalize)
1756
1757#endif
1758
1759
1760.align 4
/*
 * _key_expansion_128 / _key_expansion_256a: derive and store one 16-byte
 * round key.
 * In:   %xmm0 = round key to expand, %xmm1 = AESKEYGENASSIST result,
 *       %xmm4 = scratch whose low dword must be zero (it is zero again
 *       on return), TKEYP = address to store to
 * Out:  %xmm0 = new round key, also stored at (TKEYP); TKEYP advanced by 16
 * Clobbers: %xmm1, %xmm4
 */
1761_key_expansion_128:
1762_key_expansion_256a:
1763	pshufd $0b11111111, %xmm1, %xmm1	# broadcast dword 3 of xmm1
/*
 * The two shufps/pxor pairs turn xmm0 = {w0, w1, w2, w3} into
 * {w0, w0^w1, w0^w1^w2, w0^w1^w2^w3}.
 */
1764	shufps $0b00010000, %xmm0, %xmm4
1765	pxor %xmm4, %xmm0
1766	shufps $0b10001100, %xmm0, %xmm4
1767	pxor %xmm4, %xmm0
1768	pxor %xmm1, %xmm0
1769	movaps %xmm0, (TKEYP)
1770	add $0x10, TKEYP
1771	ret
1772ENDPROC(_key_expansion_128)
1773ENDPROC(_key_expansion_256a)
1774
1775.align 4
/*
 * _key_expansion_192a: expansion step that stores 32 bytes at TKEYP.
 * In:   %xmm0 and %xmm2 = current key material, %xmm1 = AESKEYGENASSIST
 *       result (presumably, as for the other helpers - confirm at the
 *       call sites), %xmm4 = scratch whose low dword must be zero,
 *       TKEYP = address to store to
 * Out:  %xmm0 and %xmm2 updated; 32 bytes stored; TKEYP advanced by 0x20
 * Clobbers: %xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
1776_key_expansion_192a:
1777	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1
/*
 * The two shufps/pxor pairs turn xmm0 = {w0, w1, w2, w3} into
 * {w0, w0^w1, w0^w1^w2, w0^w1^w2^w3}.
 */
1778	shufps $0b00010000, %xmm0, %xmm4
1779	pxor %xmm4, %xmm0
1780	shufps $0b10001100, %xmm0, %xmm4
1781	pxor %xmm4, %xmm0
1782	pxor %xmm1, %xmm0
1783
1784	movaps %xmm2, %xmm5
1785	movaps %xmm2, %xmm6			# keep the old xmm2 for the store below
1786	pslldq $4, %xmm5			# old xmm2 shifted left by one dword
1787	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of the new xmm0
1788	pxor %xmm3, %xmm2
1789	pxor %xmm5, %xmm2
1790
1791	movaps %xmm0, %xmm1
1792	shufps $0b01000100, %xmm0, %xmm6	# {old xmm2 low qword, new xmm0 low qword}
1793	movaps %xmm6, (TKEYP)
1794	shufps $0b01001110, %xmm2, %xmm1	# {new xmm0 high qword, new xmm2 low qword}
1795	movaps %xmm1, 0x10(TKEYP)
1796	add $0x20, TKEYP
1797	ret
1798ENDPROC(_key_expansion_192a)
1799
1800.align 4
/*
 * _key_expansion_192b: expansion step that stores 16 bytes at TKEYP.
 * Same update of %xmm0 and %xmm2 as _key_expansion_192a, but only the new
 * %xmm0 is stored and TKEYP is advanced by 0x10.
 * In:   %xmm0, %xmm2 = current key material, %xmm1 = AESKEYGENASSIST result
 *       (presumably - confirm at the call sites), %xmm4 = scratch whose
 *       low dword must be zero, TKEYP = address to store to
 * Clobbers: %xmm1, %xmm3, %xmm4, %xmm5
 */
1801_key_expansion_192b:
1802	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of xmm1
/*
 * The two shufps/pxor pairs turn xmm0 = {w0, w1, w2, w3} into
 * {w0, w0^w1, w0^w1^w2, w0^w1^w2^w3}.
 */
1803	shufps $0b00010000, %xmm0, %xmm4
1804	pxor %xmm4, %xmm0
1805	shufps $0b10001100, %xmm0, %xmm4
1806	pxor %xmm4, %xmm0
1807	pxor %xmm1, %xmm0
1808
1809	movaps %xmm2, %xmm5
1810	pslldq $4, %xmm5			# old xmm2 shifted left by one dword
1811	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of the new xmm0
1812	pxor %xmm3, %xmm2
1813	pxor %xmm5, %xmm2
1814
1815	movaps %xmm0, (TKEYP)
1816	add $0x10, TKEYP
1817	ret
1818ENDPROC(_key_expansion_192b)
1819
1820.align 4
/*
 * _key_expansion_256b:	internal ABI
 *	Second half of a 256-bit key-expansion round; updates %xmm2.
 * input:
 *	%xmm2:	round key to be updated
 *	%xmm1:	AESKEYGENASSIST result; only dword 2 is used
 *	%xmm4:	scratch whose low dword must be 0 (it is still 0 on return)
 *	TKEYP:	address at which to store the new round key
 * output:
 *	%xmm2:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 16
 * changed:
 *	%xmm1, %xmm4
 */
1821_key_expansion_256b:
1822	pshufd $0b10101010, %xmm1, %xmm1	# broadcast dword 2 of xmm1 to all four lanes
1823	shufps $0b00010000, %xmm2, %xmm4	# running XOR of the four words, this time on xmm2
1824	pxor %xmm4, %xmm2
1825	shufps $0b10001100, %xmm2, %xmm4
1826	pxor %xmm4, %xmm2
1827	pxor %xmm1, %xmm2			# mix in the broadcast key-assist word
1828	movaps %xmm2, (TKEYP)
1829	add $0x10, TKEYP
1830	ret
1831ENDPROC(_key_expansion_256b)
1832
1833/*
1834 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1835 *                   unsigned int key_len)
1836 */
1837ENTRY(aesni_set_key)
	/*
	 * Expands the user key into the context:
	 *   offset   0: encryption round keys, in order
	 *   offset 240: decryption round keys (built at .Ldec_key below)
	 *   offset 480: key length in bytes
	 * The key size is chosen from the low byte of key_len:
	 * below 24 -> 128-bit path, exactly 24 -> 192-bit path, otherwise the
	 * 256-bit path.  key_len is not validated here.
	 */
1838	FRAME_BEGIN
1839#ifndef __x86_64__
	/* 32-bit build: save KEYP and fetch the arguments from the stack */
1840	pushl KEYP
1841	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1842	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1843	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1844#endif
1845	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1846	movaps %xmm0, (KEYP)
1847	lea 0x10(KEYP), TKEYP		# key addr
1848	movl %edx, 480(KEYP)		# remember key_len in the context
1849	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
1850	cmp $24, %dl
1851	jb .Lenc_key128
1852	je .Lenc_key192
	/* 256-bit key: second 16 bytes go straight into the schedule */
1853	movups 0x10(UKEYP), %xmm2	# other user key
1854	movaps %xmm2, (TKEYP)
1855	add $0x10, TKEYP
1856	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1857	call _key_expansion_256a
1858	AESKEYGENASSIST 0x1 %xmm0 %xmm1
1859	call _key_expansion_256b
1860	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1861	call _key_expansion_256a
1862	AESKEYGENASSIST 0x2 %xmm0 %xmm1
1863	call _key_expansion_256b
1864	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1865	call _key_expansion_256a
1866	AESKEYGENASSIST 0x4 %xmm0 %xmm1
1867	call _key_expansion_256b
1868	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1869	call _key_expansion_256a
1870	AESKEYGENASSIST 0x8 %xmm0 %xmm1
1871	call _key_expansion_256b
1872	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1873	call _key_expansion_256a
1874	AESKEYGENASSIST 0x10 %xmm0 %xmm1
1875	call _key_expansion_256b
1876	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1877	call _key_expansion_256a
1878	AESKEYGENASSIST 0x20 %xmm0 %xmm1
1879	call _key_expansion_256b
1880	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1881	call _key_expansion_256a
1882	jmp .Ldec_key
1883.Lenc_key192:
	/* 192-bit key: the remaining 8 key bytes live in the low qword of xmm2 */
1884	movq 0x10(UKEYP), %xmm2		# other user key
1885	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
1886	call _key_expansion_192a
1887	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
1888	call _key_expansion_192b
1889	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
1890	call _key_expansion_192a
1891	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
1892	call _key_expansion_192b
1893	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
1894	call _key_expansion_192a
1895	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
1896	call _key_expansion_192b
1897	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
1898	call _key_expansion_192a
1899	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
1900	call _key_expansion_192b
1901	jmp .Ldec_key
1902.Lenc_key128:
1903	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
1904	call _key_expansion_128
1905	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
1906	call _key_expansion_128
1907	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
1908	call _key_expansion_128
1909	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
1910	call _key_expansion_128
1911	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
1912	call _key_expansion_128
1913	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
1914	call _key_expansion_128
1915	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
1916	call _key_expansion_128
1917	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
1918	call _key_expansion_128
1919	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
1920	call _key_expansion_128
1921	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
1922	call _key_expansion_128
1923.Ldec_key:
	/*
	 * Build the decryption schedule at offset 240: the encryption round
	 * keys in reverse order.  The first and last keys are copied as they
	 * are; every key in between goes through AESIMC in the loop below.
	 */
1924	sub $0x10, TKEYP		# TKEYP -> last encryption round key
1925	movaps (KEYP), %xmm0
1926	movaps (TKEYP), %xmm1
1927	movaps %xmm0, 240(TKEYP)	# first enc key becomes the last dec key
1928	movaps %xmm1, 240(KEYP)		# last enc key becomes the first dec key
1929	add $0x10, KEYP
1930	lea 240-16(TKEYP), UKEYP	# UKEYP walks the dec schedule downwards
1931.align 4
1932.Ldec_key_loop:
1933	movaps (KEYP), %xmm0
1934	AESIMC %xmm0 %xmm1
1935	movaps %xmm1, (UKEYP)
1936	add $0x10, KEYP
1937	sub $0x10, UKEYP
1938	cmp TKEYP, KEYP
1939	jb .Ldec_key_loop		# until KEYP reaches the last enc round key
1940	xor AREG, AREG			# zero AREG; NOTE(review): presumably the int return value - confirm
1941#ifndef __x86_64__
1942	popl KEYP
1943#endif
1944	FRAME_END
1945	ret
1946ENDPROC(aesni_set_key)
1947
1948/*
1949 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1950 */
1951ENTRY(aesni_enc)
	/* Encrypt one 16-byte block from src into dst with the schedule in ctx. */
1952	FRAME_BEGIN
1953#ifndef __x86_64__
	/* 32-bit build: save the registers used below and fetch the arguments from the stack */
1954	pushl KEYP
1955	pushl KLEN
1956	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
1957	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
1958	movl (FRAME_OFFSET+20)(%esp), INP	# src
1959#endif
1960	movl 480(KEYP), KLEN		# key length
1961	movups (INP), STATE		# input
1962	call _aesni_enc1
1963	movups STATE, (OUTP)		# output
1964#ifndef __x86_64__
1965	popl KLEN
1966	popl KEYP
1967#endif
1968	FRAME_END
1969	ret
1970ENDPROC(aesni_enc)
1971
1972/*
1973 * _aesni_enc1:		internal ABI
1974 * input:
1975 *	KEYP:		key struct pointer
1976 *	KLEN:		key length
1977 *	STATE:		initial state (input)
1978 * output:
1979 *	STATE:		final state (output)
1980 * changed:
1981 *	KEY
1982 *	TKEYP (T1)
1983 */
1984.align 4
1985_aesni_enc1:
	/*
	 * TKEYP is biased by key length (KEYP + 0x30 / 0x50 / 0x70 for
	 * KLEN < 24 / == 24 / > 24) so that the shared tail below always finds
	 * the last round key at 0x70(TKEYP).  Longer keys enter earlier and
	 * fall through the shorter-key labels.
	 */
1986	movaps (KEYP), KEY		# key
1987	mov KEYP, TKEYP
1988	pxor KEY, STATE		# round 0
1989	add $0x30, TKEYP
1990	cmp $24, KLEN
1991	jb .Lenc128
1992	lea 0x20(TKEYP), TKEYP		# lea, not add: keeps the cmp flags for the je below
1993	je .Lenc192
1994	add $0x20, TKEYP
1995	movaps -0x60(TKEYP), KEY
1996	AESENC KEY STATE
1997	movaps -0x50(TKEYP), KEY
1998	AESENC KEY STATE
1999.align 4
2000.Lenc192:
2001	movaps -0x40(TKEYP), KEY
2002	AESENC KEY STATE
2003	movaps -0x30(TKEYP), KEY
2004	AESENC KEY STATE
2005.align 4
2006.Lenc128:
2007	movaps -0x20(TKEYP), KEY
2008	AESENC KEY STATE
2009	movaps -0x10(TKEYP), KEY
2010	AESENC KEY STATE
2011	movaps (TKEYP), KEY
2012	AESENC KEY STATE
2013	movaps 0x10(TKEYP), KEY
2014	AESENC KEY STATE
2015	movaps 0x20(TKEYP), KEY
2016	AESENC KEY STATE
2017	movaps 0x30(TKEYP), KEY
2018	AESENC KEY STATE
2019	movaps 0x40(TKEYP), KEY
2020	AESENC KEY STATE
2021	movaps 0x50(TKEYP), KEY
2022	AESENC KEY STATE
2023	movaps 0x60(TKEYP), KEY
2024	AESENC KEY STATE
2025	movaps 0x70(TKEYP), KEY
2026	AESENCLAST KEY STATE		# last round
2027	ret
2028ENDPROC(_aesni_enc1)
2029
2030/*
2031 * _aesni_enc4:	internal ABI
2032 * input:
2033 *	KEYP:		key struct pointer
2034 *	KLEN:		key length
2035 *	STATE1:		initial state (input)
2036 *	STATE2
2037 *	STATE3
2038 *	STATE4
2039 * output:
2040 *	STATE1:		final state (output)
2041 *	STATE2
2042 *	STATE3
2043 *	STATE4
2044 * changed:
2045 *	KEY
2046 *	TKEYP (T1)
2047 */
2048.align 4
2049_aesni_enc4:
	/*
	 * Four-block version of _aesni_enc1: each round key is loaded once
	 * and applied to STATE1..STATE4.  TKEYP is biased by key length
	 * (KEYP + 0x30 / 0x50 / 0x70) so that the last round key is always at
	 * 0x70(TKEYP).
	 */
2050	movaps (KEYP), KEY		# key
2051	mov KEYP, TKEYP
2052	pxor KEY, STATE1		# round 0
2053	pxor KEY, STATE2
2054	pxor KEY, STATE3
2055	pxor KEY, STATE4
2056	add $0x30, TKEYP
2057	cmp $24, KLEN
2058	jb .L4enc128
2059	lea 0x20(TKEYP), TKEYP		# lea, not add: keeps the cmp flags for the je below
2060	je .L4enc192
2061	add $0x20, TKEYP
2062	movaps -0x60(TKEYP), KEY
2063	AESENC KEY STATE1
2064	AESENC KEY STATE2
2065	AESENC KEY STATE3
2066	AESENC KEY STATE4
2067	movaps -0x50(TKEYP), KEY
2068	AESENC KEY STATE1
2069	AESENC KEY STATE2
2070	AESENC KEY STATE3
2071	AESENC KEY STATE4
2072#.align 4
2073.L4enc192:
2074	movaps -0x40(TKEYP), KEY
2075	AESENC KEY STATE1
2076	AESENC KEY STATE2
2077	AESENC KEY STATE3
2078	AESENC KEY STATE4
2079	movaps -0x30(TKEYP), KEY
2080	AESENC KEY STATE1
2081	AESENC KEY STATE2
2082	AESENC KEY STATE3
2083	AESENC KEY STATE4
2084#.align 4
2085.L4enc128:
2086	movaps -0x20(TKEYP), KEY
2087	AESENC KEY STATE1
2088	AESENC KEY STATE2
2089	AESENC KEY STATE3
2090	AESENC KEY STATE4
2091	movaps -0x10(TKEYP), KEY
2092	AESENC KEY STATE1
2093	AESENC KEY STATE2
2094	AESENC KEY STATE3
2095	AESENC KEY STATE4
2096	movaps (TKEYP), KEY
2097	AESENC KEY STATE1
2098	AESENC KEY STATE2
2099	AESENC KEY STATE3
2100	AESENC KEY STATE4
2101	movaps 0x10(TKEYP), KEY
2102	AESENC KEY STATE1
2103	AESENC KEY STATE2
2104	AESENC KEY STATE3
2105	AESENC KEY STATE4
2106	movaps 0x20(TKEYP), KEY
2107	AESENC KEY STATE1
2108	AESENC KEY STATE2
2109	AESENC KEY STATE3
2110	AESENC KEY STATE4
2111	movaps 0x30(TKEYP), KEY
2112	AESENC KEY STATE1
2113	AESENC KEY STATE2
2114	AESENC KEY STATE3
2115	AESENC KEY STATE4
2116	movaps 0x40(TKEYP), KEY
2117	AESENC KEY STATE1
2118	AESENC KEY STATE2
2119	AESENC KEY STATE3
2120	AESENC KEY STATE4
2121	movaps 0x50(TKEYP), KEY
2122	AESENC KEY STATE1
2123	AESENC KEY STATE2
2124	AESENC KEY STATE3
2125	AESENC KEY STATE4
2126	movaps 0x60(TKEYP), KEY
2127	AESENC KEY STATE1
2128	AESENC KEY STATE2
2129	AESENC KEY STATE3
2130	AESENC KEY STATE4
2131	movaps 0x70(TKEYP), KEY
2132	AESENCLAST KEY STATE1		# last round
2133	AESENCLAST KEY STATE2
2134	AESENCLAST KEY STATE3
2135	AESENCLAST KEY STATE4
2136	ret
2137ENDPROC(_aesni_enc4)
2138
2139/*
2140 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2141 */
2142ENTRY(aesni_dec)
	/* Decrypt one 16-byte block from src into dst with the schedule in ctx. */
2143	FRAME_BEGIN
2144#ifndef __x86_64__
	/* 32-bit build: save the registers used below and fetch the arguments from the stack */
2145	pushl KEYP
2146	pushl KLEN
2147	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
2148	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
2149	movl (FRAME_OFFSET+20)(%esp), INP	# src
2150#endif
2151	mov 480(KEYP), KLEN		# key length
2152	add $240, KEYP			# decryption round keys start at offset 240 of ctx
2153	movups (INP), STATE		# input
2154	call _aesni_dec1
2155	movups STATE, (OUTP)		#output
2156#ifndef __x86_64__
2157	popl KLEN
2158	popl KEYP
2159#endif
2160	FRAME_END
2161	ret
2162ENDPROC(aesni_dec)
2163
2164/*
2165 * _aesni_dec1:		internal ABI
2166 * input:
2167 *	KEYP:		key struct pointer
2168 *	KLEN:		key length
2169 *	STATE:		initial state (input)
2170 * output:
2171 *	STATE:		final state (output)
2172 * changed:
2173 *	KEY
2174 *	TKEYP (T1)
2175 */
2176.align 4
2177_aesni_dec1:
	/*
	 * Mirror image of _aesni_enc1 using AESDEC/AESDECLAST.  The public
	 * callers in this file add 240 to KEYP before calling, so KEYP points
	 * at the decryption round keys.  TKEYP is biased by key length
	 * (KEYP + 0x30 / 0x50 / 0x70) so that the last round key is always at
	 * 0x70(TKEYP).
	 */
2178	movaps (KEYP), KEY		# key
2179	mov KEYP, TKEYP
2180	pxor KEY, STATE		# round 0
2181	add $0x30, TKEYP
2182	cmp $24, KLEN
2183	jb .Ldec128
2184	lea 0x20(TKEYP), TKEYP		# lea, not add: keeps the cmp flags for the je below
2185	je .Ldec192
2186	add $0x20, TKEYP
2187	movaps -0x60(TKEYP), KEY
2188	AESDEC KEY STATE
2189	movaps -0x50(TKEYP), KEY
2190	AESDEC KEY STATE
2191.align 4
2192.Ldec192:
2193	movaps -0x40(TKEYP), KEY
2194	AESDEC KEY STATE
2195	movaps -0x30(TKEYP), KEY
2196	AESDEC KEY STATE
2197.align 4
2198.Ldec128:
2199	movaps -0x20(TKEYP), KEY
2200	AESDEC KEY STATE
2201	movaps -0x10(TKEYP), KEY
2202	AESDEC KEY STATE
2203	movaps (TKEYP), KEY
2204	AESDEC KEY STATE
2205	movaps 0x10(TKEYP), KEY
2206	AESDEC KEY STATE
2207	movaps 0x20(TKEYP), KEY
2208	AESDEC KEY STATE
2209	movaps 0x30(TKEYP), KEY
2210	AESDEC KEY STATE
2211	movaps 0x40(TKEYP), KEY
2212	AESDEC KEY STATE
2213	movaps 0x50(TKEYP), KEY
2214	AESDEC KEY STATE
2215	movaps 0x60(TKEYP), KEY
2216	AESDEC KEY STATE
2217	movaps 0x70(TKEYP), KEY
2218	AESDECLAST KEY STATE		# last round
2219	ret
2220ENDPROC(_aesni_dec1)
2221
2222/*
2223 * _aesni_dec4:	internal ABI
2224 * input:
2225 *	KEYP:		key struct pointer
2226 *	KLEN:		key length
2227 *	STATE1:		initial state (input)
2228 *	STATE2
2229 *	STATE3
2230 *	STATE4
2231 * output:
2232 *	STATE1:		final state (output)
2233 *	STATE2
2234 *	STATE3
2235 *	STATE4
2236 * changed:
2237 *	KEY
2238 *	TKEYP (T1)
2239 */
2240.align 4
2241_aesni_dec4:
	/*
	 * Four-block version of _aesni_dec1: each round key is loaded once
	 * and applied to STATE1..STATE4.  TKEYP is biased by key length
	 * (KEYP + 0x30 / 0x50 / 0x70) so that the last round key is always at
	 * 0x70(TKEYP).
	 */
2242	movaps (KEYP), KEY		# key
2243	mov KEYP, TKEYP
2244	pxor KEY, STATE1		# round 0
2245	pxor KEY, STATE2
2246	pxor KEY, STATE3
2247	pxor KEY, STATE4
2248	add $0x30, TKEYP
2249	cmp $24, KLEN
2250	jb .L4dec128
2251	lea 0x20(TKEYP), TKEYP		# lea, not add: keeps the cmp flags for the je below
2252	je .L4dec192
2253	add $0x20, TKEYP
2254	movaps -0x60(TKEYP), KEY
2255	AESDEC KEY STATE1
2256	AESDEC KEY STATE2
2257	AESDEC KEY STATE3
2258	AESDEC KEY STATE4
2259	movaps -0x50(TKEYP), KEY
2260	AESDEC KEY STATE1
2261	AESDEC KEY STATE2
2262	AESDEC KEY STATE3
2263	AESDEC KEY STATE4
2264.align 4
2265.L4dec192:
2266	movaps -0x40(TKEYP), KEY
2267	AESDEC KEY STATE1
2268	AESDEC KEY STATE2
2269	AESDEC KEY STATE3
2270	AESDEC KEY STATE4
2271	movaps -0x30(TKEYP), KEY
2272	AESDEC KEY STATE1
2273	AESDEC KEY STATE2
2274	AESDEC KEY STATE3
2275	AESDEC KEY STATE4
2276.align 4
2277.L4dec128:
2278	movaps -0x20(TKEYP), KEY
2279	AESDEC KEY STATE1
2280	AESDEC KEY STATE2
2281	AESDEC KEY STATE3
2282	AESDEC KEY STATE4
2283	movaps -0x10(TKEYP), KEY
2284	AESDEC KEY STATE1
2285	AESDEC KEY STATE2
2286	AESDEC KEY STATE3
2287	AESDEC KEY STATE4
2288	movaps (TKEYP), KEY
2289	AESDEC KEY STATE1
2290	AESDEC KEY STATE2
2291	AESDEC KEY STATE3
2292	AESDEC KEY STATE4
2293	movaps 0x10(TKEYP), KEY
2294	AESDEC KEY STATE1
2295	AESDEC KEY STATE2
2296	AESDEC KEY STATE3
2297	AESDEC KEY STATE4
2298	movaps 0x20(TKEYP), KEY
2299	AESDEC KEY STATE1
2300	AESDEC KEY STATE2
2301	AESDEC KEY STATE3
2302	AESDEC KEY STATE4
2303	movaps 0x30(TKEYP), KEY
2304	AESDEC KEY STATE1
2305	AESDEC KEY STATE2
2306	AESDEC KEY STATE3
2307	AESDEC KEY STATE4
2308	movaps 0x40(TKEYP), KEY
2309	AESDEC KEY STATE1
2310	AESDEC KEY STATE2
2311	AESDEC KEY STATE3
2312	AESDEC KEY STATE4
2313	movaps 0x50(TKEYP), KEY
2314	AESDEC KEY STATE1
2315	AESDEC KEY STATE2
2316	AESDEC KEY STATE3
2317	AESDEC KEY STATE4
2318	movaps 0x60(TKEYP), KEY
2319	AESDEC KEY STATE1
2320	AESDEC KEY STATE2
2321	AESDEC KEY STATE3
2322	AESDEC KEY STATE4
2323	movaps 0x70(TKEYP), KEY
2324	AESDECLAST KEY STATE1		# last round
2325	AESDECLAST KEY STATE2
2326	AESDECLAST KEY STATE3
2327	AESDECLAST KEY STATE4
2328	ret
2329ENDPROC(_aesni_dec4)
2330
2331/*
2332 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2333 *		      size_t len)
2334 */
2335ENTRY(aesni_ecb_enc)
	/*
	 * ECB-encrypt len bytes from src to dst: four blocks at a time while
	 * at least 64 bytes remain, then one block at a time.  A trailing
	 * partial block (len % 16) is left untouched.
	 * LEN is an unsigned byte count, so every length test uses the
	 * unsigned condition codes (jb / jae).
	 */
2336	FRAME_BEGIN
2337#ifndef __x86_64__
	/* 32-bit build: save the registers used below and fetch the arguments from the stack */
2338	pushl LEN
2339	pushl KEYP
2340	pushl KLEN
2341	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2342	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2343	movl (FRAME_OFFSET+24)(%esp), INP	# src
2344	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2345#endif
2346	test LEN, LEN		# check length
2347	jz .Lecb_enc_ret
2348	mov 480(KEYP), KLEN		# key length
2349	cmp $16, LEN
2350	jb .Lecb_enc_ret		# less than one block: nothing to do
2351	cmp $64, LEN
2352	jb .Lecb_enc_loop1
2353.align 4
2354.Lecb_enc_loop4:
2355	movups (INP), STATE1
2356	movups 0x10(INP), STATE2
2357	movups 0x20(INP), STATE3
2358	movups 0x30(INP), STATE4
2359	call _aesni_enc4
2360	movups STATE1, (OUTP)
2361	movups STATE2, 0x10(OUTP)
2362	movups STATE3, 0x20(OUTP)
2363	movups STATE4, 0x30(OUTP)
2364	sub $64, LEN
2365	add $64, INP
2366	add $64, OUTP
2367	cmp $64, LEN
2368	jae .Lecb_enc_loop4		# unsigned: at least 4 more blocks
2369	cmp $16, LEN
2370	jb .Lecb_enc_ret
2371.align 4
2372.Lecb_enc_loop1:
2373	movups (INP), STATE1
2374	call _aesni_enc1
2375	movups STATE1, (OUTP)
2376	sub $16, LEN
2377	add $16, INP
2378	add $16, OUTP
2379	cmp $16, LEN
2380	jae .Lecb_enc_loop1		# unsigned: at least 1 more block
2381.Lecb_enc_ret:
2382#ifndef __x86_64__
2383	popl KLEN
2384	popl KEYP
2385	popl LEN
2386#endif
2387	FRAME_END
2388	ret
2389ENDPROC(aesni_ecb_enc)
2390
2391/*
2392 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2393 *		      size_t len);
2394 */
2395ENTRY(aesni_ecb_dec)
	/*
	 * ECB-decrypt len bytes from src to dst: four blocks at a time while
	 * at least 64 bytes remain, then one block at a time.  A trailing
	 * partial block (len % 16) is left untouched.
	 * LEN is an unsigned byte count, so every length test uses the
	 * unsigned condition codes (jb / jae).
	 */
2396	FRAME_BEGIN
2397#ifndef __x86_64__
	/* 32-bit build: save the registers used below and fetch the arguments from the stack */
2398	pushl LEN
2399	pushl KEYP
2400	pushl KLEN
2401	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2402	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2403	movl (FRAME_OFFSET+24)(%esp), INP	# src
2404	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2405#endif
2406	test LEN, LEN
2407	jz .Lecb_dec_ret
2408	mov 480(KEYP), KLEN		# key length
2409	add $240, KEYP			# decryption round keys start at offset 240 of ctx
2410	cmp $16, LEN
2411	jb .Lecb_dec_ret		# less than one block: nothing to do
2412	cmp $64, LEN
2413	jb .Lecb_dec_loop1
2414.align 4
2415.Lecb_dec_loop4:
2416	movups (INP), STATE1
2417	movups 0x10(INP), STATE2
2418	movups 0x20(INP), STATE3
2419	movups 0x30(INP), STATE4
2420	call _aesni_dec4
2421	movups STATE1, (OUTP)
2422	movups STATE2, 0x10(OUTP)
2423	movups STATE3, 0x20(OUTP)
2424	movups STATE4, 0x30(OUTP)
2425	sub $64, LEN
2426	add $64, INP
2427	add $64, OUTP
2428	cmp $64, LEN
2429	jae .Lecb_dec_loop4		# unsigned: at least 4 more blocks
2430	cmp $16, LEN
2431	jb .Lecb_dec_ret
2432.align 4
2433.Lecb_dec_loop1:
2434	movups (INP), STATE1
2435	call _aesni_dec1
2436	movups STATE1, (OUTP)
2437	sub $16, LEN
2438	add $16, INP
2439	add $16, OUTP
2440	cmp $16, LEN
2441	jae .Lecb_dec_loop1		# unsigned: at least 1 more block
2442.Lecb_dec_ret:
2443#ifndef __x86_64__
2444	popl KLEN
2445	popl KEYP
2446	popl LEN
2447#endif
2448	FRAME_END
2449	ret
2450ENDPROC(aesni_ecb_dec)
2451
2452/*
2453 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2454 *		      size_t len, u8 *iv)
2455 */
2456ENTRY(aesni_cbc_enc)
	/*
	 * CBC-encrypt len bytes from src to dst, one block at a time (each
	 * block depends on the previous ciphertext block).  The last
	 * ciphertext block is written back to *iv.  If len < 16 nothing is
	 * processed and *iv is left unchanged.
	 * LEN is an unsigned byte count, so every length test uses the
	 * unsigned condition codes (jb / jae).
	 */
2457	FRAME_BEGIN
2458#ifndef __x86_64__
	/* 32-bit build: save the registers used below and fetch the arguments from the stack */
2459	pushl IVP
2460	pushl LEN
2461	pushl KEYP
2462	pushl KLEN
2463	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2464	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2465	movl (FRAME_OFFSET+28)(%esp), INP	# src
2466	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2467	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2468#endif
2469	cmp $16, LEN
2470	jb .Lcbc_enc_ret
2471	mov 480(KEYP), KLEN		# key length
2472	movups (IVP), STATE	# load iv as initial state
2473.align 4
2474.Lcbc_enc_loop:
2475	movups (INP), IN	# load input
2476	pxor IN, STATE		# chain: plaintext ^ previous ciphertext (or iv)
2477	call _aesni_enc1
2478	movups STATE, (OUTP)	# store output
2479	sub $16, LEN
2480	add $16, INP
2481	add $16, OUTP
2482	cmp $16, LEN
2483	jae .Lcbc_enc_loop	# unsigned: at least 1 more block
2484	movups STATE, (IVP)	# last ciphertext block is the next iv
2485.Lcbc_enc_ret:
2486#ifndef __x86_64__
2487	popl KLEN
2488	popl KEYP
2489	popl LEN
2490	popl IVP
2491#endif
2492	FRAME_END
2493	ret
2494ENDPROC(aesni_cbc_enc)
2495
2496/*
2497 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2498 *		      size_t len, u8 *iv)
2499 */
2500ENTRY(aesni_cbc_dec)
	/*
	 * CBC-decrypt len bytes from src to dst: four blocks at a time while
	 * at least 64 bytes remain, then one block at a time.  The last
	 * ciphertext block consumed is written back to *iv.  If len < 16
	 * nothing is processed and *iv is left unchanged.
	 * LEN is an unsigned byte count, so every length test uses the
	 * unsigned condition codes (jb / jae).
	 */
2501	FRAME_BEGIN
2502#ifndef __x86_64__
	/* 32-bit build: save the registers used below and fetch the arguments from the stack */
2503	pushl IVP
2504	pushl LEN
2505	pushl KEYP
2506	pushl KLEN
2507	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2508	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2509	movl (FRAME_OFFSET+28)(%esp), INP	# src
2510	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2511	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2512#endif
2513	cmp $16, LEN
2514	jb .Lcbc_dec_just_ret
2515	mov 480(KEYP), KLEN		# key length
2516	add $240, KEYP			# decryption round keys start at offset 240 of ctx
2517	movups (IVP), IV
2518	cmp $64, LEN
2519	jb .Lcbc_dec_loop1
2520.align 4
2521.Lcbc_dec_loop4:
	/* keep a copy of each ciphertext block: it is the chaining value for the next block */
2522	movups (INP), IN1
2523	movaps IN1, STATE1
2524	movups 0x10(INP), IN2
2525	movaps IN2, STATE2
2526#ifdef __x86_64__
2527	movups 0x20(INP), IN3
2528	movaps IN3, STATE3
2529	movups 0x30(INP), IN4
2530	movaps IN4, STATE4
2531#else
	/* 32-bit build only uses IN1/IN2, so they are reused for blocks 3 and 4 */
2532	movups 0x20(INP), IN1
2533	movaps IN1, STATE3
2534	movups 0x30(INP), IN2
2535	movaps IN2, STATE4
2536#endif
2537	call _aesni_dec4
2538	pxor IV, STATE1
2539#ifdef __x86_64__
2540	pxor IN1, STATE2
2541	pxor IN2, STATE3
2542	pxor IN3, STATE4
2543	movaps IN4, IV
2544#else
	/* IN1/IN2 hold blocks 3/4 here; blocks 1/2 are reloaded from the input */
2545	pxor IN1, STATE4
2546	movaps IN2, IV
2547	movups (INP), IN1
2548	pxor IN1, STATE2
2549	movups 0x10(INP), IN2
2550	pxor IN2, STATE3
2551#endif
2552	movups STATE1, (OUTP)
2553	movups STATE2, 0x10(OUTP)
2554	movups STATE3, 0x20(OUTP)
2555	movups STATE4, 0x30(OUTP)
2556	sub $64, LEN
2557	add $64, INP
2558	add $64, OUTP
2559	cmp $64, LEN
2560	jae .Lcbc_dec_loop4		# unsigned: at least 4 more blocks
2561	cmp $16, LEN
2562	jb .Lcbc_dec_ret
2563.align 4
2564.Lcbc_dec_loop1:
2565	movups (INP), IN
2566	movaps IN, STATE
2567	call _aesni_dec1
2568	pxor IV, STATE
2569	movups STATE, (OUTP)
2570	movaps IN, IV			# this ciphertext block chains into the next one
2571	sub $16, LEN
2572	add $16, INP
2573	add $16, OUTP
2574	cmp $16, LEN
2575	jae .Lcbc_dec_loop1		# unsigned: at least 1 more block
2576.Lcbc_dec_ret:
2577	movups IV, (IVP)
2578.Lcbc_dec_just_ret:
2579#ifndef __x86_64__
2580	popl KLEN
2581	popl KEYP
2582	popl LEN
2583	popl IVP
2584#endif
2585	FRAME_END
2586	ret
2587ENDPROC(aesni_cbc_dec)
2588
2589#ifdef __x86_64__
2590.pushsection .rodata
2591.align 16
/*
 * Byte-shuffle control used with PSHUFB_XMM by _aesni_inc_init/_aesni_inc:
 * entry i selects source byte 15-i, i.e. it reverses all 16 bytes.
 * 16-byte aligned because it is loaded with movaps.
 */
2592.Lbswap_mask:
2593	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2594.popsection
2595
2596/*
2597 * _aesni_inc_init:	internal ABI
2598 *	setup registers used by _aesni_inc
2599 * input:
2600 *	IV
2601 * output:
2602 *	CTR:	== IV, in little endian
2603 *	TCTR_LOW: == lower qword of CTR
2604 *	INC:	== 1, in little endian
2605 *	BSWAP_MASK == endian swapping mask
2606 */
2607.align 4
2608_aesni_inc_init:
	/* RIP-relative load: this code is x86_64-only, no absolute relocation needed */
2609	movaps .Lbswap_mask(%rip), BSWAP_MASK
2610	movaps IV, CTR
2611	PSHUFB_XMM BSWAP_MASK CTR	# CTR = byte-reversed IV
2612	mov $1, TCTR_LOW
2613	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1
2614	MOVQ_R64_XMM CTR TCTR_LOW	# per the header above: TCTR_LOW = lower qword of CTR
2615	ret
2616ENDPROC(_aesni_inc_init)
2617
2618/*
2619 * _aesni_inc:		internal ABI
2620 *	Increase IV by 1, IV is in big endian
2621 * input:
2622 *	IV
2623 *	CTR:	== IV, in little endian
2624 *	TCTR_LOW: == lower qword of CTR
2625 *	INC:	== 1, in little endian
2626 *	BSWAP_MASK == endian swapping mask
2627 * output:
2628 *	IV:	Increase by 1
2629 * changed:
2630 *	CTR:	== output IV, in little endian
2631 *	TCTR_LOW: == lower qword of CTR
2632 */
2633.align 4
2634_aesni_inc:
	/*
	 * paddq adds the two qwords independently, so a carry out of the low
	 * qword is not propagated.  The same +1 is done on TCTR_LOW in a
	 * general-purpose register, and its carry flag says when the high
	 * qword of CTR must be bumped as well.
	 */
2635	paddq INC, CTR
2636	add $1, TCTR_LOW
2637	jnc .Linc_low
2638	pslldq $8, INC			# move the 1 into the high qword
2639	paddq INC, CTR
2640	psrldq $8, INC			# restore INC == 1
2641.Linc_low:
2642	movaps CTR, IV
2643	PSHUFB_XMM BSWAP_MASK IV	# IV = byte-reversed CTR
2644	ret
2645ENDPROC(_aesni_inc)
2646
2647/*
2648 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2649 *		      size_t len, u8 *iv)
2650 */
2651ENTRY(aesni_ctr_enc)
	/*
	 * CTR mode over len bytes from src to dst: encrypt successive counter
	 * values and XOR them with the input, four blocks at a time while at
	 * least 64 bytes remain, then one block at a time.  The counter that
	 * follows the last one used is written back to *iv.  If len < 16
	 * nothing is processed and *iv is left unchanged; a trailing partial
	 * block is not handled here.
	 * LEN is an unsigned byte count, so every length test uses the
	 * unsigned condition codes (jb / jae).
	 */
2652	FRAME_BEGIN
2653	cmp $16, LEN
2654	jb .Lctr_enc_just_ret
2655	mov 480(KEYP), KLEN		# key length
2656	movups (IVP), IV
2657	call _aesni_inc_init
2658	cmp $64, LEN
2659	jb .Lctr_enc_loop1
2660.align 4
2661.Lctr_enc_loop4:
	/* each STATEn takes the current counter, then the counter is advanced */
2662	movaps IV, STATE1
2663	call _aesni_inc
2664	movups (INP), IN1
2665	movaps IV, STATE2
2666	call _aesni_inc
2667	movups 0x10(INP), IN2
2668	movaps IV, STATE3
2669	call _aesni_inc
2670	movups 0x20(INP), IN3
2671	movaps IV, STATE4
2672	call _aesni_inc
2673	movups 0x30(INP), IN4
2674	call _aesni_enc4
2675	pxor IN1, STATE1
2676	movups STATE1, (OUTP)
2677	pxor IN2, STATE2
2678	movups STATE2, 0x10(OUTP)
2679	pxor IN3, STATE3
2680	movups STATE3, 0x20(OUTP)
2681	pxor IN4, STATE4
2682	movups STATE4, 0x30(OUTP)
2683	sub $64, LEN
2684	add $64, INP
2685	add $64, OUTP
2686	cmp $64, LEN
2687	jae .Lctr_enc_loop4		# unsigned: at least 4 more blocks
2688	cmp $16, LEN
2689	jb .Lctr_enc_ret
2690.align 4
2691.Lctr_enc_loop1:
2692	movaps IV, STATE
2693	call _aesni_inc
2694	movups (INP), IN
2695	call _aesni_enc1
2696	pxor IN, STATE
2697	movups STATE, (OUTP)
2698	sub $16, LEN
2699	add $16, INP
2700	add $16, OUTP
2701	cmp $16, LEN
2702	jae .Lctr_enc_loop1		# unsigned: at least 1 more block
2703.Lctr_enc_ret:
2704	movups IV, (IVP)		# hand the next counter value back to the caller
2705.Lctr_enc_just_ret:
2706	FRAME_END
2707	ret
2708ENDPROC(aesni_ctr_enc)
2709
2710/*
2711 * _aesni_gf128mul_x_ble:		internal ABI
2712 *	Multiply in GF(2^128) for XTS IVs
2713 * input:
2714 *	IV:	current IV
2715 *	GF128MUL_MASK == mask with 0x87 and 0x01
2716 * output:
2717 *	IV:	next IV
2718 * changed:
2719 *	CTR:	== temporary value
2720 */
2721#define _aesni_gf128mul_x_ble() \
2722	pshufd $0x13, IV, CTR;		/* CTR dwords = IV[3], IV[0], IV[1], IV[0] */ \
2723	paddq IV, IV;			/* shift each 64-bit half of IV left by one bit */ \
2724	psrad $31, CTR;			/* each dword -> all ones if its top bit was set, else 0 */ \
2725	pand GF128MUL_MASK, CTR;	/* keep only the mask bits where a bit was shifted out */ \
2726	pxor CTR, IV;			/* fold the shifted-out bits back into IV */
2727
2728/*
2729 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2730 *			 bool enc, u8 *iv)
2731 */
2732ENTRY(aesni_xts_crypt8)
	/*
	 * XTS-process exactly eight 16-byte blocks (0x80 bytes) from src to
	 * dst as two batches of four.  For each block the tweak is XORed
	 * into the input, parked in the matching dst slot while the cipher
	 * runs, then read back and XORed into the cipher output.  The tweak
	 * that follows the eighth block is written back to *iv.
	 *
	 * enc (in %cl) selects the direction:
	 *   enc != 0: %r11 = _aesni_enc4, key offset 0
	 *   enc == 0: %r11 = _aesni_dec4, key offset 240
	 *
	 * Symbol addresses are taken RIP-relative; this function is only
	 * built for x86_64.
	 */
2733	FRAME_BEGIN
2734	cmpb $0, %cl
2735	movl $0, %ecx			# mov, not xor: the cmpb flags feed the cmovs below
2736	movl $240, %r10d
2737	leaq _aesni_enc4(%rip), %r11
2738	leaq _aesni_dec4(%rip), %rax
2739	cmovel %r10d, %ecx		# enc == 0: use the decryption keys at offset 240
2740	cmoveq %rax, %r11		# enc == 0: call _aesni_dec4
2741
2742	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2743	movups (IVP), IV
2744
2745	mov 480(KEYP), KLEN		# key length; read before KEYP is offset
2746	addq %rcx, KEYP
2747
	/* first batch: blocks 0-3; INC is used as a scratch register throughout */
2748	movdqa IV, STATE1
2749	movdqu 0x00(INP), INC
2750	pxor INC, STATE1
2751	movdqu IV, 0x00(OUTP)		# park tweak 0 in the output buffer
2752
2753	_aesni_gf128mul_x_ble()
2754	movdqa IV, STATE2
2755	movdqu 0x10(INP), INC
2756	pxor INC, STATE2
2757	movdqu IV, 0x10(OUTP)
2758
2759	_aesni_gf128mul_x_ble()
2760	movdqa IV, STATE3
2761	movdqu 0x20(INP), INC
2762	pxor INC, STATE3
2763	movdqu IV, 0x20(OUTP)
2764
2765	_aesni_gf128mul_x_ble()
2766	movdqa IV, STATE4
2767	movdqu 0x30(INP), INC
2768	pxor INC, STATE4
2769	movdqu IV, 0x30(OUTP)
2770
2771	CALL_NOSPEC %r11
2772
	/* finish blocks 0-3 while setting up blocks 4-7 in the freed registers */
2773	movdqu 0x00(OUTP), INC		# read parked tweak 0 back
2774	pxor INC, STATE1
2775	movdqu STATE1, 0x00(OUTP)
2776
2777	_aesni_gf128mul_x_ble()
2778	movdqa IV, STATE1
2779	movdqu 0x40(INP), INC
2780	pxor INC, STATE1
2781	movdqu IV, 0x40(OUTP)
2782
2783	movdqu 0x10(OUTP), INC
2784	pxor INC, STATE2
2785	movdqu STATE2, 0x10(OUTP)
2786
2787	_aesni_gf128mul_x_ble()
2788	movdqa IV, STATE2
2789	movdqu 0x50(INP), INC
2790	pxor INC, STATE2
2791	movdqu IV, 0x50(OUTP)
2792
2793	movdqu 0x20(OUTP), INC
2794	pxor INC, STATE3
2795	movdqu STATE3, 0x20(OUTP)
2796
2797	_aesni_gf128mul_x_ble()
2798	movdqa IV, STATE3
2799	movdqu 0x60(INP), INC
2800	pxor INC, STATE3
2801	movdqu IV, 0x60(OUTP)
2802
2803	movdqu 0x30(OUTP), INC
2804	pxor INC, STATE4
2805	movdqu STATE4, 0x30(OUTP)
2806
2807	_aesni_gf128mul_x_ble()
2808	movdqa IV, STATE4
2809	movdqu 0x70(INP), INC
2810	pxor INC, STATE4
2811	movdqu IV, 0x70(OUTP)
2812
2813	_aesni_gf128mul_x_ble()
2814	movups IV, (IVP)		# tweak for the block after these eight
2815
2816	CALL_NOSPEC %r11
2817
	/* finish blocks 4-7 */
2818	movdqu 0x40(OUTP), INC
2819	pxor INC, STATE1
2820	movdqu STATE1, 0x40(OUTP)
2821
2822	movdqu 0x50(OUTP), INC
2823	pxor INC, STATE2
2824	movdqu STATE2, 0x50(OUTP)
2825
2826	movdqu 0x60(OUTP), INC
2827	pxor INC, STATE3
2828	movdqu STATE3, 0x60(OUTP)
2829
2830	movdqu 0x70(OUTP), INC
2831	pxor INC, STATE4
2832	movdqu STATE4, 0x70(OUTP)
2833
2834	FRAME_END
2835	ret
2836ENDPROC(aesni_xts_crypt8)
2837
2838#endif
2839