1/*
2 *	Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
3 *
4 * This is AES128/192/256 CTR mode optimization implementation. It requires
5 * the support of Intel(R) AESNI and AVX instructions.
6 *
7 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
9 * Additional information on it can be found at:
10 *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
11 *
12 * This file is provided under a dual BSD/GPLv2 license.  When using or
13 * redistributing this file, you may do so under either license.
14 *
15 * GPL LICENSE SUMMARY
16 *
17 * Copyright(c) 2014 Intel Corporation.
18 *
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of version 2 of the GNU General Public License as
21 * published by the Free Software Foundation.
22 *
23 * This program is distributed in the hope that it will be useful, but
24 * WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
26 * General Public License for more details.
27 *
28 * Contact Information:
29 * James Guilford <james.guilford@intel.com>
30 * Sean Gulley <sean.m.gulley@intel.com>
31 * Chandramouli Narayanan <mouli@linux.intel.com>
32 *
33 * BSD LICENSE
34 *
35 * Copyright(c) 2014 Intel Corporation.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 *
41 * Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in
45 * the documentation and/or other materials provided with the
46 * distribution.
47 * Neither the name of Intel Corporation nor the names of its
48 * contributors may be used to endorse or promote products derived
49 * from this software without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 *
63 */
64
65#include <linux/linkage.h>
66#include <asm/inst.h>
67
#define VMOVDQ		vmovdqu		/* in/out data may be unaligned */

/* xdata0..xdata7: the up-to-8 counter blocks encrypted in parallel */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8	/* running counter, kept byte-swapped for vpaddq */
#define xbyteswap	%xmm9	/* vpshufb mask: reverse the 16 bytes of a lane */
/*
 * xkey0/xkey4/xkey8/xkey12 cache four round keys across main-loop
 * iterations (for KEY_128 they hold rounds 0/3/6/9, otherwise rounds
 * 0/4/8/12); xkeyA/xkeyB are scratch for the remaining rounds.
 */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* function arguments, SysV AMD64 calling convention */
#define p_in		%rdi	/* arg1: source buffer */
#define p_iv		%rsi	/* arg2: IV / initial counter (updated on return) */
#define p_keys		%rdx	/* arg3: expanded AES key schedule */
#define p_out		%rcx	/* arg4: destination buffer */
#define num_bytes	%r8	/* arg5: length in bytes */

#define tmp		%r10	/* scratch: odd (mod 8) block count in bytes */
#define	DDQ_DATA	0	/* club selector: ddq_add_<id> constant */
#define	XDATA		1	/* club selector: xmm<id> register */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
99
.section .rodata
.align 16

/* vpshufb mask reversing the byte order of a 128-bit lane (BE <-> LE) */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* selects the low 64 bits of the 128-bit counter (for vptest wrap check) */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* adds 1 to the high qword: carry propagation when the low qword wraps */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/* per-block low-qword increments: block i uses counter + i */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
125
126.text
127
/* generate a unique variable for ddq_add_x */

.macro setddq n
	/* paste the numeric argument onto the label: var_ddq_add = ddq_add_<n> */
	var_ddq_add = ddq_add_\n
.endm
133
/* generate a unique variable for xmm register */
.macro setxdata n
	/* paste the numeric argument onto the register name: var_xdata = %xmm<n> */
	var_xdata = %xmm\n
.endm
138
/* club the numeric 'id' to the symbol 'name' */

.macro club name, id
/*
 * .altmacro is required so that %\id evaluates the symbol to its
 * numeric value before it is token-pasted inside setddq/setxdata.
 */
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.elseif \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
150
/*
 * do_aes num_in_par load_keys key_len
 *
 * Encrypt 'b' (1..8) consecutive counter blocks and XOR the resulting
 * keystream into 'b' 16-byte blocks of p_in, storing to p_out (one CTR
 * chunk).  Arguments:
 *   b       - number of blocks processed in parallel
 *   k       - nonzero: (re)load the cached round keys xkey0/4/8/12 from
 *             p_keys; zero: assume a previous invocation left them loaded
 *   key_len - KEY_128 / KEY_192 / KEY_256 (selects 10/12/14 rounds)
 *
 * All other round keys are streamed through xkeyA/xkeyB, alternating so
 * that the next key loads while the current round executes.
 * This increments p_in, but not p_out
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 uses the current counter value, swapped to BE for AES */
	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * Blocks 1..by-1: counter + i in the low qword.  vptest sets ZF
	 * when the low 64 bits of the sum are all zero, i.e. the low
	 * qword has just wrapped; in that case carry 1 into the high
	 * qword of both this block's value and xcounter itself.  The
	 * carry is applied to xcounter exactly once, since consecutive
	 * low-qword values cross zero exactly once.
	 */
	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	/* round 0 (AddRoundKey); also advance xcounter past all 'by' blocks */
	vpxor	xkey0, xdata0, xdata0
	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	/* for KEY_128 the cached xkey4 actually holds round key 3 */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/* advance p_in early; the ciphertext XOR below uses negative offsets */
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		/* for KEY_192/KEY_256 the cached xkey4 holds round key 4 */
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	/* cached xkey8: round key 6 for KEY_128, round key 8 otherwise */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	/* cached xkey12: round key 9 for KEY_128, round key 12 otherwise */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10: final round for KEY_128 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	/* extra rounds 11..12 (KEY_192) / 11..14 (KEY_256) */
	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				/* key 12: final round for KEY_192 */
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14: final round for KEY_256 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the keystream into the input, two blocks at a time.  The
	 * round keys are no longer needed, so xkeyA/xkeyB double as data
	 * scratch; p_in was already advanced, hence the negative offsets.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd trailing block when 'by' is odd */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
425
/* do_aes with round-key (re)load: used for the odd leading blocks */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm
429
/* do_aes reusing the already-cached round keys: used in the by-8 main loop */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm
433
/* main body of aes ctr load */

.macro do_aes_ctrmain key_len
	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len		/* less than one block: nothing to do */

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	/* swap the big-endian IV into arithmetic (little-endian) order */
	vpshufb	xbyteswap, xcounter, xcounter

	/*
	 * tmp = num_bytes mod 128 in whole blocks (bytes 16..112).
	 * NOTE(review): low 4 bits of num_bytes are ignored here -- this
	 * assumes num_bytes is a multiple of 16; confirm with the caller.
	 * The 1..7 odd blocks are handled first (with key load), then the
	 * remainder runs through the 8-blocks-at-a-time main loop.
	 */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1 <= tmp/16 <= 7 odd blocks: binary dispatch on the count */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	/* (~7)*16 == -128: keep only the multiple-of-128 remainder */
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len


.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/*
	 * No odd blocks, so do_aes_load never ran: prime the cached key
	 * registers here (KEY_128 caches rounds 0/3/6/9, longer keys
	 * cache rounds 0/4/8/12 -- matching do_aes's usage).
	 */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (128 bytes) and > 0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV, swapped back to big-endian byte order */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	ret
.endm
539
/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 * SysV AMD64: in=%rdi, iv=%rsi, keys=%rdx, out=%rcx, num_bytes=%r8
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)
552
/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 * SysV AMD64: in=%rdi, iv=%rsi, keys=%rdx, out=%rcx, num_bytes=%r8
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)
565
/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 * SysV AMD64: in=%rdi, iv=%rsi, keys=%rdx, out=%rcx, num_bytes=%r8
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)
578