1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 *    Author: Huang Ying <ying.huang@intel.com>
9 *            Vinodh Gopal <vinodh.gopal@intel.com>
10 *            Kahraman Akdemir
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 */
17
18#include <linux/linkage.h>
19#include <asm/inst.h>
20
21.text
22
/*
 * Register aliases used by the routines in this file.
 */

/* Arguments of the exported functions: ctx, dst, src, len, iv */
#define KEYP	%rdi		/* key schedule (ctx) pointer */
#define OUTP	%rsi		/* destination buffer */
#define INP	%rdx		/* source buffer */
#define LEN	%rcx		/* remaining length in bytes */
#define IVP	%r8		/* IV buffer, used by the CBC routines */

/* Scratch general-purpose registers */
#define KLEN	%r9d		/* key length, loaded from offset 480 of ctx */
#define T1	%r10
#define TKEYP	T1		/* round-key cursor inside _aesni_{enc,dec}{1,4} */
#define T2	%r11

/* SSE registers holding the block(s) being transformed */
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1		/* single-block routines use the first slot */

/* SSE registers holding copies of the input blocks (CBC) */
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1

/* SSE registers for the current round key and the CBC chaining value */
#define KEY	%xmm2
#define IV	%xmm3
45
/*
 * _key_expansion_128 / _key_expansion_256a:	internal ABI
 * input:
 *	%xmm0:	key block (w0, w1, w2, w3) to be updated in place
 *	%xmm1:	AESKEYGENASSIST result prepared by the caller
 *	%xmm4:	lowest dword must be zero
 *	%rcx:	address where the new round key is stored (16-byte aligned)
 * output:
 *	%xmm0:	new round key, also written to (%rcx)
 *	%rcx:	advanced by 16
 * changed:
 *	%xmm1, %xmm4 (the lowest dword of %xmm4 is zero again on return)
 */
_key_expansion_128:
_key_expansion_256a:
	shufps $0x10, %xmm0, %xmm4	# xmm4 = (0, 0, w1, w0), low dword first
	pxor %xmm4, %xmm0		# xmm0 = (w0, w1, w2^w1, w3^w0)
	shufps $0x8c, %xmm0, %xmm4	# xmm4 = (0, w0, w0, w1^w2)
	pxor %xmm4, %xmm0		# each dword is now the XOR of w0 up to itself
	pshufd $0xff, %xmm1, %xmm1	# broadcast dword 3 of the assist result
	pxor %xmm1, %xmm0
	movaps %xmm0, (%rcx)
	add $16, %rcx
	ret
57
/*
 * _key_expansion_192a:	internal ABI
 * input:
 *	%xmm0:	first four dwords of the key state, updated in place
 *	%xmm2:	remaining key state; only its low two dwords are stored
 *	%xmm1:	AESKEYGENASSIST result prepared by the caller
 *	%xmm4:	lowest dword must be zero
 *	%rcx:	output address (16-byte aligned)
 * output:
 *	%xmm0, %xmm2:	updated key state
 *	(%rcx):		low two dwords of the old %xmm2, then the low two
 *			dwords of the new %xmm0
 *	16(%rcx):	high two dwords of the new %xmm0, then the low two
 *			dwords of the new %xmm2
 *	%rcx:		advanced by 32
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
_key_expansion_192a:
	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0		# running XOR over the dwords of xmm0
	pshufd $0x55, %xmm1, %xmm1	# broadcast dword 1 of the assist result
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm6		# old xmm2, needed for the first store
	movaps %xmm2, %xmm5
	pslldq $4, %xmm5		# old xmm2 moved up by one dword
	pshufd $0xff, %xmm0, %xmm3	# broadcast the new top dword of xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	shufps $0x44, %xmm0, %xmm6	# (old xmm2 low half, new xmm0 low half)
	movaps %xmm6, (%rcx)
	movaps %xmm0, %xmm1
	shufps $0x4e, %xmm2, %xmm1	# (new xmm0 high half, new xmm2 low half)
	movaps %xmm1, 0x10(%rcx)
	add $32, %rcx
	ret
80
/*
 * _key_expansion_192b:	internal ABI
 * input:
 *	%xmm0:	first four dwords of the key state, updated in place
 *	%xmm2:	remaining key state, updated in place
 *	%xmm1:	AESKEYGENASSIST result prepared by the caller
 *	%xmm4:	lowest dword must be zero
 *	%rcx:	output address (16-byte aligned)
 * output:
 *	%xmm0:	updated, also written to (%rcx)
 *	%xmm2:	updated, not stored by this routine
 *	%rcx:	advanced by 16
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
_key_expansion_192b:
	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0		# running XOR over the dwords of xmm0
	pshufd $0x55, %xmm1, %xmm1	# broadcast dword 1 of the assist result
	pxor %xmm1, %xmm0

	pshufd $0xff, %xmm0, %xmm3	# broadcast the new top dword of xmm0
	movaps %xmm2, %xmm5
	pslldq $4, %xmm5		# old xmm2 moved up by one dword
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (%rcx)
	add $16, %rcx
	ret
98
/*
 * _key_expansion_256b:	internal ABI
 * input:
 *	%xmm2:	key block (w0, w1, w2, w3) to be updated in place
 *	%xmm1:	AESKEYGENASSIST result prepared by the caller
 *	%xmm4:	lowest dword must be zero
 *	%rcx:	address where the new round key is stored (16-byte aligned)
 * output:
 *	%xmm2:	new round key, also written to (%rcx)
 *	%rcx:	advanced by 16
 * changed:
 *	%xmm1, %xmm4 (the lowest dword of %xmm4 is zero again on return)
 */
_key_expansion_256b:
	shufps $0x10, %xmm2, %xmm4	# xmm4 = (0, 0, w1, w0), low dword first
	pxor %xmm4, %xmm2
	shufps $0x8c, %xmm2, %xmm4	# xmm4 = (0, w0, w0, w1^w2)
	pxor %xmm4, %xmm2		# each dword is now the XOR of w0 up to itself
	pshufd $0xaa, %xmm1, %xmm1	# broadcast dword 2 of the assist result
	pxor %xmm1, %xmm2
	movaps %xmm2, (%rcx)
	add $16, %rcx
	ret
109
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Expand the user key into ctx:
 *	ctx + 0:	encryption round keys
 *	ctx + 240:	decryption round keys: the encryption round keys in
 *			reverse order, with every key except the first and
 *			the last passed through AESIMC
 *	ctx + 480:	key_len as passed in
 *
 * The low byte of key_len selects the schedule: below 24 takes the 128-bit
 * path, exactly 24 the 192-bit path, anything above the 256-bit path.
 * ctx is accessed with movaps and therefore has to be 16-byte aligned;
 * in_key may be unaligned.
 *
 * Always returns 0.
 */
ENTRY(aesni_set_key)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	movups (%rsi), %xmm0		# first 16 bytes of the user key
	movaps %xmm0, (%rdi)		# ... are round key 0
	movl %edx, 480(%rdi)		# remember key_len in the context
	lea 16(%rdi), %rcx		# rcx = where the next round key goes
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192

	/* 256-bit key: the second 16 bytes of the user key are round key 1 */
	movups 16(%rsi), %xmm2
	movaps %xmm2, (%rcx)
	add $16, %rcx
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key

	/* 192-bit key: bytes 16..23 of the user key go into the low half of xmm2 */
.Lenc_key192:
	movq 16(%rsi), %xmm2
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key

	/* 128-bit key: ten expansion steps on xmm0 */
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128

	/*
	 * Build the decryption schedule at ctx + 240.  rcx points just past
	 * the last encryption round key on entry to this part.
	 */
.Ldec_key:
	sub $16, %rcx			# rcx = last encryption round key
	movaps (%rdi), %xmm0
	movaps %xmm0, 240(%rcx)		# first enc key becomes the last dec key
	movaps (%rcx), %xmm1
	movaps %xmm1, 240(%rdi)		# last enc key becomes the first dec key
	add $16, %rdi			# rdi walks the enc keys upwards
	lea 240-16(%rcx), %rsi		# rsi walks the dec keys downwards
.align 4
.Ldec_key_loop:
	movaps (%rdi), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (%rsi)
	add $16, %rdi
	sub $16, %rsi
	cmp %rcx, %rdi
	jb .Ldec_key_loop
	xor %eax, %eax			# return 0
	ret
212
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt the single 16-byte block at src and write the result to dst.
 * src and dst are accessed with movups, so neither needs to be aligned.
 */
ENTRY(aesni_enc)
	movups (INP), STATE		# input block
	movl 480(KEYP), KLEN		# key length stored in the context
	call _aesni_enc1
	movups STATE, (OUTP)		# output block
	ret
222
/*
 * _aesni_enc1:		internal ABI
 * input:
 *	KEYP:		pointer to the encryption round keys (16-byte aligned)
 *	KLEN:		key length; below 24 runs 10 rounds, exactly 24 runs
 *			12 rounds, anything above runs 14 rounds
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_enc1:
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP		# cursor position for the 10-round case
	pxor KEY, STATE			# round 0
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags for the je below
	je .Lenc192
	add $0x20, TKEYP
	/* two rounds that only the longest key size has */
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	/* two rounds shared by the 12- and 14-round cases */
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	/* the last ten rounds are common to all key sizes */
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE		# last round
	ret
278
/*
 * _aesni_enc4:	internal ABI
 *
 * Four-block variant of _aesni_enc1: every round key is loaded once and
 * applied to all four states.
 * input:
 *	KEYP:		pointer to the encryption round keys (16-byte aligned)
 *	KLEN:		key length; below 24 runs 10 rounds, exactly 24 runs
 *			12 rounds, anything above runs 14 rounds
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_enc4:
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP		# cursor position for the 10-round case
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags for the je below
	je .L4enc192
	add $0x20, TKEYP
	/* two rounds that only the longest key size has */
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	/* .align 4 is commented out here, unlike in _aesni_dec4 */
.L4enc192:
	/* two rounds shared by the 12- and 14-round cases */
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	/* .align 4 is commented out here, unlike in _aesni_dec4 */
.L4enc128:
	/* the last ten rounds are common to all key sizes */
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
385
/*
 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt the single 16-byte block at src and write the result to dst,
 * using the decryption round keys stored at offset 240 of ctx.
 * src and dst are accessed with movups, so neither needs to be aligned.
 */
ENTRY(aesni_dec)
	movups (INP), STATE		# input block
	movl 480(KEYP), KLEN		# key length stored in the context
	add $240, KEYP			# switch to the decryption round keys
	call _aesni_dec1
	movups STATE, (OUTP)		# output block
	ret
396
/*
 * _aesni_dec1:		internal ABI
 * input:
 *	KEYP:		pointer to the decryption round keys (16-byte
 *			aligned); the callers pass ctx + 240
 *	KLEN:		key length; below 24 runs 10 rounds, exactly 24 runs
 *			12 rounds, anything above runs 14 rounds
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_dec1:
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP		# cursor position for the 10-round case
	pxor KEY, STATE			# round 0
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags for the je below
	je .Ldec192
	add $0x20, TKEYP
	/* two rounds that only the longest key size has */
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	/* two rounds shared by the 12- and 14-round cases */
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	/* the last ten rounds are common to all key sizes */
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE		# last round
	ret
452
/*
 * _aesni_dec4:	internal ABI
 *
 * Four-block variant of _aesni_dec1: every round key is loaded once and
 * applied to all four states.
 * input:
 *	KEYP:		pointer to the decryption round keys (16-byte
 *			aligned); the callers pass ctx + 240
 *	KLEN:		key length; below 24 runs 10 rounds, exactly 24 runs
 *			12 rounds, anything above runs 14 rounds
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_dec4:
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP		# cursor position for the 10-round case
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags for the je below
	je .L4dec192
	add $0x20, TKEYP
	/* two rounds that only the longest key size has */
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec192:
	/* two rounds shared by the 12- and 14-round cases */
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec128:
	/* the last ten rounds are common to all key sizes */
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
559
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt floor(len / 16) blocks from src to dst.  Blocks are handled
 * four at a time while at least 64 bytes remain, then one at a time.  A
 * trailing partial block (len % 16 bytes) is left untouched, and nothing
 * at all happens when len is below 16.
 *
 * len is a size_t, so every length comparison uses the unsigned
 * condition codes (jb / jae).
 */
ENTRY(aesni_ecb_enc)
	cmp $16, LEN			# less than one block (including 0)?
	jb .Lecb_enc_ret
	mov 480(KEYP), KLEN		# key length stored in the context
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4		# unsigned: LEN is a size_t
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1		# unsigned: LEN is a size_t
.Lecb_enc_ret:
	ret
602
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt floor(len / 16) blocks from src to dst using the decryption
 * round keys at offset 240 of ctx.  Blocks are handled four at a time
 * while at least 64 bytes remain, then one at a time.  A trailing partial
 * block (len % 16 bytes) is left untouched, and nothing at all happens
 * when len is below 16.
 *
 * len is a size_t, so every length comparison uses the unsigned
 * condition codes (jb / jae).
 */
ENTRY(aesni_ecb_dec)
	cmp $16, LEN			# less than one block (including 0)?
	jb .Lecb_dec_ret
	mov 480(KEYP), KLEN		# key length stored in the context
	add $240, KEYP			# switch to the decryption round keys
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4		# unsigned: LEN is a size_t
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1		# unsigned: LEN is a size_t
.Lecb_dec_ret:
	ret
646
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt floor(len / 16) blocks from src to dst.  The chaining value
 * is read from iv on entry and the last ciphertext block is written back
 * to iv on exit.  A trailing partial block (len % 16 bytes) is left
 * untouched; when len is below 16 nothing is done and iv is not modified.
 *
 * len is a size_t, so every length comparison uses the unsigned
 * condition codes (jb / jae).
 */
ENTRY(aesni_cbc_enc)
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN		# key length stored in the context
	movups (IVP), STATE		# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN		# load input
	pxor IN, STATE			# chain with the previous ciphertext / iv
	call _aesni_enc1
	movups STATE, (OUTP)		# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop		# unsigned: LEN is a size_t
	movups STATE, (IVP)		# hand the chaining value back
.Lcbc_enc_ret:
	ret
670
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt floor(len / 16) blocks from src to dst using the decryption
 * round keys at offset 240 of ctx.  Blocks are handled four at a time
 * while at least 64 bytes remain, then one at a time.  The chaining value
 * is read from iv on entry, and the last ciphertext block consumed is
 * written back to iv on exit.  A trailing partial block (len % 16 bytes)
 * is left untouched; when len is below 16 nothing is done and iv is not
 * modified.
 *
 * Each ciphertext block is copied to an IN register before it is
 * decrypted, because it is still needed as the chaining value for the
 * following block.
 *
 * len is a size_t, so every length comparison uses the unsigned
 * condition codes (jb / jae).
 */
ENTRY(aesni_cbc_dec)
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN		# key length stored in the context
	add $240, KEYP			# switch to the decryption round keys
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
	call _aesni_dec4
	pxor IV, STATE1			# first block chains with the iv
	pxor IN1, STATE2		# the others with the preceding ciphertext
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV			# last ciphertext is the next chaining value
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4		# unsigned: LEN is a size_t
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV			# this ciphertext is the next chaining value
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1		# unsigned: LEN is a size_t
.Lcbc_dec_ret:
	movups IV, (IVP)		# hand the chaining value back
.Lcbc_dec_just_ret:
	ret
727