1/*
2 *	Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
3 *
4 * This is AES128/192/256 CTR mode optimization implementation. It requires
5 * the support of Intel(R) AESNI and AVX instructions.
6 *
7 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
9 * Additional information on it can be found at:
10 *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
11 *
12 * This file is provided under a dual BSD/GPLv2 license.  When using or
13 * redistributing this file, you may do so under either license.
14 *
15 * GPL LICENSE SUMMARY
16 *
17 * Copyright(c) 2014 Intel Corporation.
18 *
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of version 2 of the GNU General Public License as
21 * published by the Free Software Foundation.
22 *
23 * This program is distributed in the hope that it will be useful, but
24 * WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
26 * General Public License for more details.
27 *
28 * Contact Information:
29 * James Guilford <james.guilford@intel.com>
30 * Sean Gulley <sean.m.gulley@intel.com>
31 * Chandramouli Narayanan <mouli@linux.intel.com>
32 *
33 * BSD LICENSE
34 *
35 * Copyright(c) 2014 Intel Corporation.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 *
41 * Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in
45 * the documentation and/or other materials provided with the
46 * distribution.
47 * Neither the name of Intel Corporation nor the names of its
48 * contributors may be used to endorse or promote products derived
49 * from this software without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 *
63 */
64
65#include <linux/linkage.h>
66#include <asm/inst.h>
67
/* Preprocessor helpers used to build symbol and register names */
#define CONCAT(a,b)	a##b
#define DDQ(i)		CONCAT(ddq_add_,i)
#define XMM(i)		CONCAT(%xmm, i)

/* Unaligned 128-bit move used for the input and output buffers */
#define VMOVDQ		vmovdqu

/* Argument registers: in, iv, keys, out, num_bytes */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

/* Scratch general purpose register */
#define tmp		%r10

/* Data blocks processed in parallel (up to eight) */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7

/* Running counter (kept byte-reversed) and the vpshufb reversal mask */
#define xcounter	%xmm8
#define xbyteswap	%xmm9

/*
 * Round keys kept in registers between do_aes invocations; which key
 * schedule slot each one holds depends on the key length.
 */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13

/* Round keys reloaded from memory on every do_aes invocation */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* Selectors for the first argument of the club macro */
#define DDQ_DATA	0
#define XDATA		1

/* Values accepted for the key_len macro argument */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
102
.section .rodata
.align 16

/* vpshufb mask that reverses the order of the 16 bytes of a register */
byteswap_const:	.octa 0x000102030405060708090A0B0C0D0E0F

/*
 * All ones in the low qword: vptest against it sets ZF exactly when the
 * low 64 bits of the tested counter are zero.
 */
ddq_low_msk:	.octa 0x0000000000000000FFFFFFFFFFFFFFFF

/* One in the high qword: added to propagate a carry out of the low qword */
ddq_high_add_1:	.octa 0x00000000000000010000000000000000

/* Counter increments 1 .. 8, selected by name through the DDQ() macro */
ddq_add_1:	.octa 0x00000000000000000000000000000001
ddq_add_2:	.octa 0x00000000000000000000000000000002
ddq_add_3:	.octa 0x00000000000000000000000000000003
ddq_add_4:	.octa 0x00000000000000000000000000000004
ddq_add_5:	.octa 0x00000000000000000000000000000005
ddq_add_6:	.octa 0x00000000000000000000000000000006
ddq_add_7:	.octa 0x00000000000000000000000000000007
ddq_add_8:	.octa 0x00000000000000000000000000000008

.text
130
/*
 * setddq n
 * Bind the assembler symbol var_ddq_add to the constant ddq_add_<n>.
 */
.macro setddq n
	.set var_ddq_add, DDQ(\n)
.endm
136
/*
 * setxdata n
 * Bind the assembler symbol var_xdata to the register %xmm<n>.
 */
.macro setxdata n
	.set var_xdata, XMM(\n)
.endm
141
/*
 * club name, id
 *
 * Evaluate the expression 'id' to a number and glue it onto the symbol
 * family selected by 'name':
 *   XDATA    -> var_xdata   = %xmm<id>
 *   DDQ_DATA -> var_ddq_add = ddq_add_<id>
 * Any other selector does nothing.
 *
 * Alternate macro mode is switched on only around the inner calls so that
 * the % operator turns the expression into its numeric value before it is
 * passed down.
 */
.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.elseif \name == DDQ_DATA
		setddq %\id
	.endif
.noaltmacro
.endm
153
/*
 * aes_round_all op, rk
 *
 * Emit "op rk, xdataN, xdataN" for N = 0 .. by-1, i.e. apply one round
 * instruction (vaesenc or vaesenclast) with the round key held in rk to
 * every block of the current batch.  Reads the assembler symbol by and
 * leaves i equal to by.
 */
.macro aes_round_all op, rk
	.set i, 0
	.rept by
		club XDATA, i
		\op	\rk, var_xdata, var_xdata
		.set i, (i + 1)
	.endr
.endm

/*
 * do_aes num_in_par load_keys key_len
 *
 * Encrypt num_in_par (1 .. 8) consecutive counter values and XOR the
 * result with the input, storing to p_out.
 *
 *   b        number of blocks handled in parallel
 *   k        non-zero: also (re)load xkey0/xkey4/xkey8/xkey12 from p_keys;
 *            zero: trust the values already held in those registers
 *   key_len  KEY_128, KEY_192 or KEY_256
 *
 * This increments p_in, but not p_out.  xcounter is advanced by b.
 * Round keys that are not cached are streamed through xkeyA and xkeyB,
 * each load being issued ahead of the round that consumes it.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 takes the counter as it stands */
	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * Blocks 1 .. by-1 take counter + i.  vpaddq only adds within each
	 * qword, so a low qword that became zero signals a wrap: bump the
	 * high qword of this block and of the running counter.
	 */
	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	/* key 0 for block 0, then step the counter past this batch */
	vpxor	xkey0, xdata0, xdata0
	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	/* key 0 for the remaining blocks */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	/* key 1 */
	aes_round_all vaesenc, xkeyA

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	/* key 2 */
	aes_round_all vaesenc, xkeyB

	add	$(16*by), p_in

	/* key 3 */
	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
		aes_round_all vaesenc, xkey4
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
		aes_round_all vaesenc, xkeyA
	.endif

	vmovdqa	5*16(p_keys), xkeyA

	/* key 4 */
	.if (klen == KEY_128)
		aes_round_all vaesenc, xkeyB
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		aes_round_all vaesenc, xkey4
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	/* key 5 */
	aes_round_all vaesenc, xkeyA

	vmovdqa	7*16(p_keys), xkeyA

	/* key 6 */
	.if (klen == KEY_128)
		aes_round_all vaesenc, xkey8
		vmovdqa	8*16(p_keys), xkeyB
	.else
		aes_round_all vaesenc, xkeyB
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	/* key 7 */
	aes_round_all vaesenc, xkeyA

	/* key 8 */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
		aes_round_all vaesenc, xkeyB
	.else
		vmovdqa	9*16(p_keys), xkeyA
		aes_round_all vaesenc, xkey8
	.endif

	vmovdqa	10*16(p_keys), xkeyB

	.if (klen == KEY_128)
		/* key 9 */
		aes_round_all vaesenc, xkey12
		/* key 10 is the last one for AES-128 */
		aes_round_all vaesenclast, xkeyB
	.else
		/* key 9 */
		aes_round_all vaesenc, xkeyA

		vmovdqa	11*16(p_keys), xkeyA

		/* key 10 */
		aes_round_all vaesenc, xkeyB

		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		/* key 11 */
		aes_round_all vaesenc, xkeyA

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA

			/* key 12 */
			aes_round_all vaesenc, xkey12

			vmovdqa	14*16(p_keys), xkeyB

			/* key 13 */
			aes_round_all vaesenc, xkeyA
			/* key 14 is the last one for AES-256 */
			aes_round_all vaesenclast, xkeyB
		.else
			/* key 12 is the last one for AES-192 */
			aes_round_all vaesenclast, xkey12
		.endif
	.endif

	/*
	 * XOR the keystream with the input, two blocks at a time.  p_in was
	 * already advanced above, hence the negative displacements.  xkeyA
	 * and xkeyB are free again and serve as input staging registers.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd block count: one block is left over */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* write the finished blocks out */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
428
/* do_aes with load_keys set: the cached round-key registers are (re)loaded */
.macro do_aes_load val, key_len
	do_aes b=\val, k=1, key_len=\key_len
.endm
432
/* do_aes with load_keys clear: the cached round-key registers are reused */
.macro do_aes_noload val, key_len
	do_aes b=\val, k=0, key_len=\key_len
.endm
436
/*
 * do_aes_ctrmain key_len
 *
 * Main body of the AES CTR routine.  key_len is KEY_128, KEY_192 or
 * KEY_256; it is also appended to every local label so that each
 * instantiation of the macro gets its own set of labels.
 *
 * Flow:
 *   - fewer than 16 bytes: return at once, leaving the IV untouched;
 *   - load the IV from p_iv and byte-reverse it into xcounter;
 *   - handle the (num_bytes / 16) % 8 leading blocks with one do_aes_load
 *     pass, which also fills the cached round-key registers;
 *   - handle what is left eight blocks per iteration with do_aes_noload;
 *   - byte-reverse xcounter back and store it to p_iv.
 *
 * Only whole 16-byte blocks are processed; a trailing num_bytes % 16 is
 * ignored.  The macro ends in ret.
 *
 * Clobbers: tmp, p_in, p_out, num_bytes, flags, xmm0-xmm15.
 *
 * NOTE(review): every operation on num_bytes uses the full 64-bit %r8.
 * The prototype comments on the entry points declare the length as
 * unsigned int; confirm that callers pass it zero-extended.
 */
.macro do_aes_ctrmain key_len
	/*
	 * Less than one block: nothing to do.  xbyteswap and xcounter are
	 * not loaded yet, so this exit must bypass the IV write-back below;
	 * going through it would store stale register contents over the IV.
	 */
	cmp	$16, num_bytes
	jb	.Ldo_return_noiv\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = byte count of the leading (block count mod 8) blocks */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1 <= tmp / 16 <= 7 */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

	/*
	 * Each .LeqN stanza processes N blocks, then masks num_bytes with
	 * ~7*16 (== -128) so that only whole groups of eight blocks remain.
	 */
.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len


.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/*
	 * No leading partial group, so no do_aes_load pass has run: fill the
	 * cached round-key registers here, using the same key schedule slots
	 * that do_aes loads for this key length.
	 */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a non-zero multiple of 8 blocks (8*16 bytes) here */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
.Ldo_return_noiv\key_len:
	ret
.endm
542
/*
 * AES128 CTR encrypt/decrypt, "by8" variant.
 *
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * The function body, including the final ret, is the KEY_128 expansion of
 * do_aes_ctrmain.  XMM registers are clobbered; saving and restoring them
 * must be done at a higher level.
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	do_aes_ctrmain key_len=KEY_128
ENDPROC(aes_ctr_enc_128_avx_by8)
555
/*
 * AES192 CTR encrypt/decrypt, "by8" variant.
 *
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * The function body, including the final ret, is the KEY_192 expansion of
 * do_aes_ctrmain.  XMM registers are clobbered; saving and restoring them
 * must be done at a higher level.
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	do_aes_ctrmain key_len=KEY_192
ENDPROC(aes_ctr_enc_192_avx_by8)
568
/*
 * AES256 CTR encrypt/decrypt, "by8" variant.
 *
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * The function body, including the final ret, is the KEY_256 expansion of
 * do_aes_ctrmain.  XMM registers are clobbered; saving and restoring them
 * must be done at a higher level.
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	do_aes_ctrmain key_len=KEY_256
ENDPROC(aes_ctr_enc_256_avx_by8)
581