xref: /openbmc/linux/arch/x86/crypto/aesni-intel_asm.S (revision b8bb76713ec50df2f11efee386e16f93d51e1076)
1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 *    Author: Huang Ying <ying.huang@intel.com>
9 *            Vinodh Gopal <vinodh.gopal@intel.com>
10 *            Kahraman Akdemir
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 */
17
18#include <linux/linkage.h>
19
20.text
21
/*
 * SSE register roles.  STATE1-STATE4 hold the blocks being transformed
 * (STATE aliases STATE1 for the single-block helpers), IN1-IN4 keep copies
 * of the input blocks (aesni_cbc_dec needs them for chaining), KEY holds the
 * round key currently being applied and IV the CBC chaining value.
 */
22#define STATE1	%xmm0
23#define STATE2	%xmm4
24#define STATE3	%xmm5
25#define STATE4	%xmm6
26#define STATE	STATE1
27#define IN1	%xmm1
28#define IN2	%xmm7
29#define IN3	%xmm8
30#define IN4	%xmm9
31#define IN	IN1
32#define KEY	%xmm2
33#define IV	%xmm3
34
/*
 * General purpose register roles.  KEYP, OUTP, INP, LEN and IVP are the
 * registers that carry the 1st..5th integer arguments of the exported
 * functions (ctx, dst, src, len, iv).  KLEN is loaded from 480(KEYP) by the
 * entry points, T1/TKEYP is the round-key cursor used by the _aesni_* helpers.
 * T2 is not referenced in the code visible in this file section.
 */
35#define KEYP	%rdi
36#define OUTP	%rsi
37#define INP	%rdx
38#define LEN	%rcx
39#define IVP	%r8
40#define KLEN	%r9d
41#define T1	%r10
42#define TKEYP	T1
43#define T2	%r11
44
/*
 * _key_expansion_128 / _key_expansion_256a:	internal ABI
 * input:
 *	%xmm0:		key words being expanded
 *	%xmm1:		aeskeygenassist result produced by the caller
 *	%xmm4:		scratch, cleared by aesni_set_key before the first call
 *	%rcx:		address at which the new round key is stored
 * output:
 *	%xmm0:		new round key (also written to the old %rcx)
 *	%rcx:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
_key_expansion_128:
_key_expansion_256a:
	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pshufd $0xff, %xmm1, %xmm1	# broadcast dword 3 of the assist value
	pxor %xmm1, %xmm0
	movaps %xmm0, (%rcx)		# store needs a 16-byte aligned %rcx
	add $0x10, %rcx
	ret
56
/*
 * _key_expansion_192a:	internal ABI
 * input:
 *	%xmm0:		first four key words being expanded
 *	%xmm2:		remaining two key words (low qword)
 *	%xmm1:		aeskeygenassist result produced by the caller
 *	%xmm4:		scratch, cleared by aesni_set_key before the first call
 *	%rcx:		address at which the output is stored
 * output:
 *	(%rcx):		low qword of the incoming %xmm2, low qword of new %xmm0
 *	16(%rcx):	high qword of new %xmm0, low qword of new %xmm2
 *	%xmm0, %xmm2:	updated key words
 *	%rcx:		advanced by 32 bytes
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
_key_expansion_192a:
	movaps %xmm2, %xmm6		# incoming %xmm2, needed for the first store
	movaps %xmm2, %xmm5
	pslldq $4, %xmm5

	pshufd $0x55, %xmm1, %xmm1	# broadcast dword 1 of the assist value
	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	shufps $0x44, %xmm0, %xmm6
	movaps %xmm6, (%rcx)

	pshufd $0xff, %xmm0, %xmm3	# broadcast dword 3 of the new %xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0x4e, %xmm2, %xmm1
	movaps %xmm1, 0x10(%rcx)
	add $0x20, %rcx
	ret
79
/*
 * _key_expansion_192b:	internal ABI
 * input:
 *	%xmm0:		first four key words being expanded
 *	%xmm2:		remaining two key words (low qword)
 *	%xmm1:		aeskeygenassist result produced by the caller
 *	%xmm4:		scratch, cleared by aesni_set_key before the first call
 *	%rcx:		address at which the output is stored
 * output:
 *	(%rcx):		new %xmm0
 *	%xmm0, %xmm2:	updated key words
 *	%rcx:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
_key_expansion_192b:
	movaps %xmm2, %xmm5
	pslldq $4, %xmm5

	pshufd $0x55, %xmm1, %xmm1	# broadcast dword 1 of the assist value
	shufps $0x10, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0x8c, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (%rcx)

	pshufd $0xff, %xmm0, %xmm3	# broadcast dword 3 of the new %xmm0
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	add $0x10, %rcx
	ret
97
/*
 * _key_expansion_256b:	internal ABI
 * input:
 *	%xmm2:		key words being expanded
 *	%xmm1:		aeskeygenassist result produced by the caller
 *	%xmm4:		scratch, cleared by aesni_set_key before the first call
 *	%rcx:		address at which the new round key is stored
 * output:
 *	%xmm2:		new round key (also written to the old %rcx)
 *	%rcx:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
_key_expansion_256b:
	shufps $0x10, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0x8c, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pshufd $0xaa, %xmm1, %xmm1	# broadcast dword 2 of the assist value
	pxor %xmm1, %xmm2
	movaps %xmm2, (%rcx)
	add $0x10, %rcx
	ret
108
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Expand in_key into ctx:
 *	  0(ctx):	encryption round keys, in round order
 *	240(ctx):	decryption round keys: the encryption keys in reverse
 *			order, all but the first and last passed through aesimc
 *	480(ctx):	key_len as passed in
 * The low byte of key_len selects the schedule: below 24 -> 128-bit,
 * equal to 24 -> 192-bit, above 24 -> 256-bit.  ctx is accessed with movaps
 * and therefore has to be 16-byte aligned; in_key may be unaligned.
 * Always returns 0.
 */
ENTRY(aesni_set_key)
	movl %edx, 480(%rdi)		# remember key_len for the cipher entry points
	movups (%rsi), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (%rdi)		# it is round key 0 as is
	lea 0x10(%rdi), %rcx		# %rcx = where the next round key goes
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192

	/* 256-bit key: the second 16 user key bytes are round key 1 */
	movups 0x10(%rsi), %xmm2	# other user key
	movaps %xmm2, (%rcx)
	add $0x10, %rcx
	/*
	 * Rounds 1-6, two round keys per rcon value:
	 *	aeskeygenassist $rcon, %xmm2, %xmm1 ; _key_expansion_256a
	 *	aeskeygenassist $rcon, %xmm0, %xmm1 ; _key_expansion_256b
	 */
	.irp rcon, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20
	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, \rcon
	call _key_expansion_256a
	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, \rcon
	call _key_expansion_256b
	.endr
	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40	# aeskeygenassist $0x40, %xmm2, %xmm1 (round 7)
	call _key_expansion_256a
	jmp .Ldec_key

.Lenc_key192:
	movq 0x10(%rsi), %xmm2		# other user key
	/* every step below is: aeskeygenassist $rcon, %xmm2, %xmm1 */
	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01	# round 1
	call _key_expansion_192a
	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02	# round 2
	call _key_expansion_192b
	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04	# round 3
	call _key_expansion_192a
	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08	# round 4
	call _key_expansion_192b
	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10	# round 5
	call _key_expansion_192a
	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20	# round 6
	call _key_expansion_192b
	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40	# round 7
	call _key_expansion_192a
	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80	# round 8
	call _key_expansion_192b
	jmp .Ldec_key

.Lenc_key128:
	/* rounds 1-10: aeskeygenassist $rcon, %xmm0, %xmm1 ; _key_expansion_128 */
	.irp rcon, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
	.byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, \rcon
	call _key_expansion_128
	.endr

.Ldec_key:
	sub $0x10, %rcx			# %rcx = last encryption round key
	movaps (%rcx), %xmm1
	movaps (%rdi), %xmm0
	movaps %xmm1, 0xf0(%rdi)	# last enc key  -> first dec key
	movaps %xmm0, 0xf0(%rcx)	# first enc key -> last dec key
	lea 0xe0(%rcx), %rsi		# %rsi walks the dec keys downwards
	add $0x10, %rdi			# %rdi walks the enc keys upwards
.align 4
.Ldec_key_loop:
	movaps (%rdi), %xmm0
	.byte 0x66, 0x0f, 0x38, 0xdb, 0xc8	# aesimc %xmm0, %xmm1
	movaps %xmm1, (%rsi)
	add $0x10, %rdi
	sub $0x10, %rsi
	cmp %rcx, %rdi
	jb .Ldec_key_loop		# stop once %rdi reaches the last enc key
	xor %rax, %rax			# return 0
	ret
243
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt the single 16-byte block at src and write the result to dst.
 * src and dst are accessed with movups, so no alignment is required.
 */
ENTRY(aesni_enc)
	movups (INP), STATE		# input block
	movl 480(KEYP), KLEN		# key length, as stored by aesni_set_key
	call _aesni_enc1
	movups STATE, (OUTP)		# output block
	ret
253
/*
 * _aesni_enc1:		internal ABI
 *
 * Encrypt one block.  The entry point into the round sequence depends on
 * KLEN: below 24 runs 10 rounds, equal to 24 runs 12, above 24 runs 14.
 * input:
 *	KEYP:		key struct pointer (encryption round keys at offset 0)
 *	KLEN:		key length in bytes (not the round count)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_enc1:
	movaps (KEYP), KEY		# key
	pxor KEY, STATE			# round 0
	lea 0x30(KEYP), TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP		# lea leaves the flags of the cmp intact
	je .Lenc192
	add $0x20, TKEYP
	/* two extra rounds, 256-bit keys only */
	.irp off, -0x60, -0x50
	movaps \off(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2	# aesenc KEY, STATE
	.endr
.align 4
.Lenc192:
	/* two extra rounds, 192-bit and 256-bit keys */
	.irp off, -0x40, -0x30
	movaps \off(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2	# aesenc KEY, STATE
	.endr
.align 4
.Lenc128:
	/* nine rounds common to every key length */
	.irp off, -0x20, -0x10, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2	# aesenc KEY, STATE
	.endr
	movaps 0x70(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xdd, 0xc2	# aesenclast KEY, STATE (last round)
	ret
323
/*
 * _aesni_enc4:	internal ABI
 *
 * Encrypt four independent blocks with the same key schedule.  The entry
 * point into the round sequence depends on KLEN: below 24 runs 10 rounds,
 * equal to 24 runs 12, above 24 runs 14.
 * input:
 *	KEYP:		key struct pointer (encryption round keys at offset 0)
 *	KLEN:		key length in bytes (not the round count)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_enc4:
	movaps (KEYP), KEY		# key
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	lea 0x30(KEYP), TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP		# lea leaves the flags of the cmp intact
	je .L4enc192
	add $0x20, TKEYP
	/* two extra rounds, 256-bit keys only */
	.irp off, -0x60, -0x50
	movaps \off(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2	# aesenc KEY, STATE1
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2	# aesenc KEY, STATE2
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea	# aesenc KEY, STATE3
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2	# aesenc KEY, STATE4
	.endr
.L4enc192:
	/* two extra rounds, 192-bit and 256-bit keys */
	.irp off, -0x40, -0x30
	movaps \off(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2	# aesenc KEY, STATE1
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2	# aesenc KEY, STATE2
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea	# aesenc KEY, STATE3
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2	# aesenc KEY, STATE4
	.endr
.L4enc128:
	/* nine rounds common to every key length */
	.irp off, -0x20, -0x10, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xc2	# aesenc KEY, STATE1
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xe2	# aesenc KEY, STATE2
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xea	# aesenc KEY, STATE3
	.byte 0x66, 0x0f, 0x38, 0xdc, 0xf2	# aesenc KEY, STATE4
	.endr
	movaps 0x70(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xdd, 0xc2	# aesenclast KEY, STATE1 (last round)
	.byte 0x66, 0x0f, 0x38, 0xdd, 0xe2	# aesenclast KEY, STATE2
	.byte 0x66, 0x0f, 0x38, 0xdd, 0xea	# aesenclast KEY, STATE3
	.byte 0x66, 0x0f, 0x38, 0xdd, 0xf2	# aesenclast KEY, STATE4
	ret
486
/*
 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt the single 16-byte block at src and write the result to dst.
 * src and dst are accessed with movups, so no alignment is required.
 */
ENTRY(aesni_dec)
	movups (INP), STATE		# input block
	mov 480(KEYP), KLEN		# key length, as stored by aesni_set_key
	add $0xf0, KEYP			# decryption round keys start at ctx + 240
	call _aesni_dec1
	movups STATE, (OUTP)		# output block
	ret
497
/*
 * _aesni_dec1:		internal ABI
 *
 * Decrypt one block.  The entry point into the round sequence depends on
 * KLEN: below 24 runs 10 rounds, equal to 24 runs 12, above 24 runs 14.
 * input:
 *	KEYP:		pointer to the decryption round keys
 *			(the callers in this file pass ctx + 240)
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_dec1:
	movaps (KEYP), KEY		# key
	pxor KEY, STATE			# round 0
	lea 0x30(KEYP), TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP		# lea leaves the flags of the cmp intact
	je .Ldec192
	add $0x20, TKEYP
	/* two extra rounds, 256-bit keys only */
	.irp off, -0x60, -0x50
	movaps \off(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2	# aesdec KEY, STATE
	.endr
.align 4
.Ldec192:
	/* two extra rounds, 192-bit and 256-bit keys */
	.irp off, -0x40, -0x30
	movaps \off(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2	# aesdec KEY, STATE
	.endr
.align 4
.Ldec128:
	/* nine rounds common to every key length */
	.irp off, -0x20, -0x10, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2	# aesdec KEY, STATE
	.endr
	movaps 0x70(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xdf, 0xc2	# aesdeclast KEY, STATE (last round)
	ret
567
/*
 * _aesni_dec4:	internal ABI
 *
 * Decrypt four independent blocks with the same key schedule.  The entry
 * point into the round sequence depends on KLEN: below 24 runs 10 rounds,
 * equal to 24 runs 12, above 24 runs 14.
 * input:
 *	KEYP:		pointer to the decryption round keys
 *			(the callers in this file pass ctx + 240)
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_dec4:
	movaps (KEYP), KEY		# key
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	lea 0x30(KEYP), TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		# lea leaves the flags of the cmp intact
	je .L4dec192
	add $0x20, TKEYP
	/* two extra rounds, 256-bit keys only */
	.irp off, -0x60, -0x50
	movaps \off(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2	# aesdec KEY, STATE1
	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2	# aesdec KEY, STATE2
	.byte 0x66, 0x0f, 0x38, 0xde, 0xea	# aesdec KEY, STATE3
	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2	# aesdec KEY, STATE4
	.endr
.align 4
.L4dec192:
	/* two extra rounds, 192-bit and 256-bit keys */
	.irp off, -0x40, -0x30
	movaps \off(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2	# aesdec KEY, STATE1
	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2	# aesdec KEY, STATE2
	.byte 0x66, 0x0f, 0x38, 0xde, 0xea	# aesdec KEY, STATE3
	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2	# aesdec KEY, STATE4
	.endr
.align 4
.L4dec128:
	/* nine rounds common to every key length */
	.irp off, -0x20, -0x10, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xde, 0xc2	# aesdec KEY, STATE1
	.byte 0x66, 0x0f, 0x38, 0xde, 0xe2	# aesdec KEY, STATE2
	.byte 0x66, 0x0f, 0x38, 0xde, 0xea	# aesdec KEY, STATE3
	.byte 0x66, 0x0f, 0x38, 0xde, 0xf2	# aesdec KEY, STATE4
	.endr
	movaps 0x70(TKEYP), KEY
	.byte 0x66, 0x0f, 0x38, 0xdf, 0xc2	# aesdeclast KEY, STATE1 (last round)
	.byte 0x66, 0x0f, 0x38, 0xdf, 0xe2	# aesdeclast KEY, STATE2
	.byte 0x66, 0x0f, 0x38, 0xdf, 0xea	# aesdeclast KEY, STATE3
	.byte 0x66, 0x0f, 0x38, 0xdf, 0xf2	# aesdeclast KEY, STATE4
	ret
730
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt every complete 16-byte block of src into dst, four blocks at
 * a time while at least 64 bytes remain, then one block at a time.  A
 * trailing partial block (len % 16 bytes) is ignored.  src and dst need no
 * alignment.
 *
 * len is unsigned, so every length comparison uses the unsigned condition
 * codes (jb/jae).
 */
ENTRY(aesni_ecb_enc)
	cmp $16, LEN			# less than one block (including 0)?
	jb .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1
.Lecb_enc_ret:
	ret
773
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt every complete 16-byte block of src into dst, four blocks at
 * a time while at least 64 bytes remain, then one block at a time.  A
 * trailing partial block (len % 16 bytes) is ignored.  src and dst need no
 * alignment.
 *
 * len is unsigned, so every length comparison uses the unsigned condition
 * codes (jb/jae).
 */
ENTRY(aesni_ecb_dec)
	cmp $16, LEN			# less than one block (including 0)?
	jb .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP			# decryption round keys start at ctx + 240
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1
.Lecb_dec_ret:
	ret
817
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt every complete 16-byte block of src into dst.  The chaining
 * value is read from iv on entry and the last ciphertext block is written
 * back to iv on exit, so a following call continues the chain.  If len is
 * below 16 nothing is read or written, iv included.  A trailing partial
 * block is ignored.
 *
 * len is unsigned, so every length comparison uses the unsigned condition
 * codes (jb/jae).
 */
ENTRY(aesni_cbc_enc)
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		# chain: plaintext ^ previous ciphertext (or iv)
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop
	movups STATE, (IVP)	# last ciphertext block is the next iv
.Lcbc_enc_ret:
	ret
841
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt every complete 16-byte block of src into dst, four blocks at
 * a time while at least 64 bytes remain, then one block at a time.  The
 * chaining value is read from iv on entry; the last ciphertext block that
 * was processed is written back to iv on every path that decrypted at least
 * one block, including the one where the four-block loop consumed the whole
 * input.  If len is below 16 nothing is read or written, iv included.
 * All input blocks of an iteration are loaded before any output is stored,
 * so dst may be the same buffer as src.
 *
 * len is unsigned, so every length comparison uses the unsigned condition
 * codes (jb/jae).
 */
ENTRY(aesni_cbc_dec)
	cmp $16, LEN
	jb .Lcbc_dec_just_ret	# nothing processed: iv must stay untouched
	mov 480(KEYP), KLEN
	add $240, KEYP		# decryption round keys start at ctx + 240
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1	# keep the ciphertext: it chains into the next block
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
	call _aesni_dec4
	pxor IV, STATE1
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV		# last ciphertext block is the next chaining value
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret	# no full block left, but IV still has to be saved
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)	# hand the chaining value back to the caller
.Lcbc_dec_just_ret:
	ret
897