/*
 *	Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is the AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in the Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/linkage.h>
#include <asm/inst.h>

#define CONCAT(a,b)	a##b
#define VMOVDQ		vmovdqu
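
/*
 * VMOVDQ is used for the in/out data, which may be unaligned, while the
 * key schedule is fetched with vmovdqa and must be 16-byte aligned.
 */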

#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8
#define xbyteswap	%xmm9
#define xkey0		%xmm10
#define xkey3		%xmm11
#define xkey6		%xmm12
#define xkey9		%xmm13
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15
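
/*
 * xkey3/xkey6/xkey9 alias the same registers as xkey4/xkey8/xkey12:
 * besides xkey0, only three round keys stay resident across the no-load
 * main loop, and which rounds they hold depends on the key length
 * (rounds 0/3/6/9 for AES128, rounds 0/4/8/12 for AES192/256; see
 * .Lmult_of_8_blks). The remaining round keys stream through
 * xkeyA/xkeyB.
 */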

#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

#define tmp		%r10
#define	DDQ(i)		CONCAT(ddq_add_,i)
#define	XMM(i)		CONCAT(%xmm, i)
#define	DDQ_DATA	0
#define	XDATA		1
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
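
/* KEY_128/192/256 select the 10-, 12- and 14-round AES key schedules */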

.section .rodata
.align 16

byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
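
/*
 * The counter (xcounter) is kept byte-swapped so it can be incremented
 * with vpaddd; ddq_add_N produces the counter block N positions ahead.
 * Note that vpaddd adds within 32-bit lanes, so this version does not
 * propagate a carry out of the low dword of the counter.
 */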

.text

/* generate a unique variable for ddq_add_x */

.macro setddq n
	var_ddq_add = DDQ(\n)
.endm

/* generate a unique variable for xmm register */
.macro setxdata n
	var_xdata = XMM(\n)
.endm

/* bind the numeric 'id' to the symbol 'name' */

.macro club name, id
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.elseif \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
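
/*
 * For example, "club XDATA, 3" expands (via .altmacro) to
 * "var_xdata = %xmm3", so code inside a .rept can address a different
 * xmm register on each iteration through var_xdata.
 */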

/*
 * do_aes num_in_par load_keys key_len
 * Encrypt 'num_in_par' counter blocks and XOR them with the input.
 * This increments p_in, but not p_out.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	vpshufb	xbyteswap, xcounter, xdata0

	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddd	var_ddq_add(%rip), xcounter, var_xdata
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	club DDQ_DATA, by
	vpaddd	var_ddq_add(%rip), xcounter, xcounter

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	add	$(16*by), p_in
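	/*
	 * p_in is advanced for all 'by' blocks up front; the input is
	 * read back at negative offsets (i*16 - 16*by) in the XOR stage.
	 */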

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif
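
	/*
	 * XOR the keystream blocks with the input; xkeyA and xkeyB are
	 * free by now and are reused as input-data scratch registers.
	 */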
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif
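
	/* write the 'by' output blocks to p_out */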
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm

.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm

.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm

/* main body of the AES CTR routine */

.macro do_aes_ctrmain key_len

	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len
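	/*
	 * The caller is assumed to pass at least one full block; note
	 * that this early exit still goes through the IV write-back at
	 * .Ldo_return2 before xcounter has been initialized.
	 */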

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* tmp holds 1 to 7 leftover blocks (16 to 112 bytes) */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
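	/*
	 * Preload the round keys that stay resident for the no-load main
	 * loop: rounds 0/4/8/12 for AES192/256, rounds 0/3/6/9 for
	 * AES128 (held in the registers named xkey4/xkey8/xkey12).
	 */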
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a positive multiple of 8 blocks (128 bytes) */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	ret
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
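/*
 * Illustrative caller sketch (an assumption for illustration, not code
 * from this file): the glue code is expected to expand the key schedule
 * beforehand and to wrap calls in kernel_fpu_begin()/kernel_fpu_end(),
 * since XMM registers are clobbered:
 *
 *	struct crypto_aes_ctx *ctx = ...;  // expanded schedule in key_enc[]
 *	kernel_fpu_begin();
 *	aes_ctr_enc_128_avx_by8(src, iv, ctx->key_enc, dst, nbytes);
 *	kernel_fpu_end();
 *
 * The 192/256-bit variants below follow the same convention.
 */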
ENTRY(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)