1/*
2 *	Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
3 *
4 * This is AES128/192/256 CTR mode optimization implementation. It requires
5 * the support of Intel(R) AESNI and AVX instructions.
6 *
7 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
9 * Additional information on it can be found at:
10 *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
11 *
12 * This file is provided under a dual BSD/GPLv2 license.  When using or
13 * redistributing this file, you may do so under either license.
14 *
15 * GPL LICENSE SUMMARY
16 *
17 * Copyright(c) 2014 Intel Corporation.
18 *
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of version 2 of the GNU General Public License as
21 * published by the Free Software Foundation.
22 *
23 * This program is distributed in the hope that it will be useful, but
24 * WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
26 * General Public License for more details.
27 *
28 * Contact Information:
29 * James Guilford <james.guilford@intel.com>
30 * Sean Gulley <sean.m.gulley@intel.com>
31 * Chandramouli Narayanan <mouli@linux.intel.com>
32 *
33 * BSD LICENSE
34 *
35 * Copyright(c) 2014 Intel Corporation.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 *
41 * Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in
45 * the documentation and/or other materials provided with the
46 * distribution.
47 * Neither the name of Intel Corporation nor the names of its
48 * contributors may be used to endorse or promote products derived
49 * from this software without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 *
63 */
64
65#include <linux/linkage.h>
66#include <asm/inst.h>
67
#define CONCAT(a,b)	a##b		/* token paste: builds ddq_add_<n> / %xmm<n> names */
#define VMOVDQ		vmovdqu		/* in/out buffers may be unaligned */

/*
 * XMM register allocation:
 *   xdata0-7   up to 8 counter blocks being encrypted in parallel
 *   xcounter   running counter, kept byte-swapped (LE) between do_aes calls
 *   xbyteswap  vpshufb mask converting the counter to/from wire byte order
 *   xkey0/4/8/12  round keys cached across main-loop iterations
 *              (which rounds land here depends on the key length - see do_aes)
 *   xkeyA/B    scratch: the remaining round keys, then input-text temporaries
 */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8
#define xbyteswap	%xmm9
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* SysV AMD64 argument registers, in declaration order */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

#define tmp		%r10		/* caller-saved scratch */
#define	DDQ(i)		CONCAT(ddq_add_,i)	/* ddq_add_<i> constant label */
#define	XMM(i)		CONCAT(%xmm, i)		/* %xmm<i> register name */
#define	DDQ_DATA	0		/* 'club' selector: bind var_ddq_add */
#define	XDATA		1		/* 'club' selector: bind var_xdata */
#define KEY_128		1		/* AES-128: 10 rounds */
#define KEY_192		2		/* AES-192: 12 rounds */
#define KEY_256		3		/* AES-256: 14 rounds */
102
.section .rodata
.align 16

/* vpshufb mask reversing all 16 bytes (big-endian <-> little-endian) */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* selects the low 64 bits; used with vptest to detect low-qword wrap */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* +1 in the high qword: manual carry propagation after vpaddq */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/* ddq_add_<n>: +n in the low qword, for counter+n of parallel blocks */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
128
129.text
130
/* generate a unique variable for ddq_add_x */

/* bind var_ddq_add to the constant label ddq_add_<n>; \n must already be
 * a literal digit (the caller expands it via .altmacro '%' evaluation) */
.macro setddq n
	var_ddq_add = DDQ(\n)
.endm
136
/* generate a unique variable for xmm register */

/* bind var_xdata to register %xmm<n> (same altmacro expansion as setddq) */
.macro setxdata n
	var_xdata = XMM(\n)
.endm
141
/* club the numeric 'id' to the symbol 'name' */

/*
 * Dispatch: bind var_ddq_add (name == DDQ_DATA) or var_xdata (name == XDATA)
 * to the object numbered 'id'.  .altmacro is needed so that '%\id' passes the
 * evaluated numeric value of 'id' (often the .rept counter 'i') into the
 * helper macros rather than the symbol name itself.
 */
.macro club name, id
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.elseif \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
153
/*
 * do_aes num_in_par load_keys key_len
 *
 * Encrypt 'b' (1..8) consecutive counter blocks in parallel and XOR the
 * keystream into 'b' blocks of text (CTR encrypt == decrypt).
 *   b:       number of blocks handled in parallel (xdata0..xdata<b-1>)
 *   k:       1 = load the cached round keys (xkey0/4/8/12) from p_keys,
 *            0 = assume they are already resident in those registers
 *   key_len: KEY_128 / KEY_192 / KEY_256 (selects the round count)
 * xcounter (byte-swapped form) is advanced by 'b' before the macro ends.
 * This increments p_in, but not p_out
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 encrypts the current counter (restored to wire order) */
	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * Build counter+1 .. counter+(by-1) for the remaining blocks.
	 * vpaddq adds the two 64-bit lanes independently, so a carry out of
	 * the low qword is lost; reconstruct it by hand: when the low qword
	 * of the sum is exactly 0 (vptest vs ddq_low_msk sets ZF), add 1 to
	 * the high qword of both this block's value and the running
	 * xcounter, so later (larger) increments also see the carry.
	 */
	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0		/* round 0 (AddRoundKey), block 0 */

	/* commit counter += by for the next call, same carry fixup */
	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	/* round 0 for blocks 1..by-1 */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	/*
	 * Key caching pattern for the rest of the schedule: for KEY_128 the
	 * cache registers xkey4/8/12 hold rounds 3/6/9, otherwise rounds
	 * 4/8/12.  A round key destined for a cache register is loaded only
	 * when load_keys != 0; keys going to the xkeyA/xkeyB scratch
	 * registers are (re)loaded every pass.
	 */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkeyA
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/* consume the input early; the loads at the bottom of the macro
	 * address it with negative offsets from the updated p_in */
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkey4
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 3 */
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkey4, var_xdata, var_xdata		/* key 4 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkeyB
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 6 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkey8
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkeyA
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkey8, var_xdata, var_xdata		/* key 8 */
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 9 */
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	/* key 10 is the final round only for AES-128 */
	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	/* AES-192 continues through round 12, AES-256 through round 14 */
	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the keystream with the input text.  The round keys are no
	 * longer needed, so xkeyA/xkeyB double as input temporaries; blocks
	 * are processed in pairs to keep two loads in flight.  p_in already
	 * points past this chunk, hence the negative offsets.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd 'by': one trailing block left over from the pairs above */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* store the output; the caller advances p_out */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
403
/* do_aes with the cached round keys (re)loaded from p_keys */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm
407
/* do_aes assuming xkey0/4/8/12 already hold the cached round keys */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm
411
/* main body of aes ctr load */

/*
 * do_aes_ctrmain key_len
 *
 * CTR-process num_bytes (rounded down to whole 16-byte blocks) from p_in
 * to p_out.  First handles the (num_bytes/16) mod 8 leftover blocks,
 * loading the round-key cache on the way, then loops 8 blocks at a time
 * with the cached keys, and finally writes the updated counter to (p_iv).
 */
.macro do_aes_ctrmain key_len

	/*
	 * Less than one full block: nothing to encrypt.  Return without
	 * touching (p_iv) -- xbyteswap/xcounter have not been loaded yet,
	 * so exiting through .Ldo_return2 here (as the code previously did)
	 * would store whatever stale data is in those XMM registers over
	 * the caller's IV.
	 */
	cmp	$16, num_bytes
	jb	.Lexit\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	/* keep the counter byte-swapped so vpaddq can increment it */
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = num_bytes & 0x70: leftover beyond a multiple of 8 blocks */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 16 <= tmp <= 112, i.e. 1..7 leftover blocks: dispatch on count */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

	/*
	 * Each .LeqN: encrypt N blocks while loading the key cache, advance
	 * p_out (do_aes advanced p_in itself), then round num_bytes down to
	 * a multiple of 128 -- note $(~7*16) is (~7)*16 == -128, clearing
	 * bits 0-6 (including any sub-16-byte remainder, which this routine
	 * never processes).
	 */
.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/*
	 * No leftover blocks, so no do_aes_load ran above: populate the key
	 * cache here.  AES-128 caches rounds 0/3/6/9 in xkey0/4/8/12,
	 * larger keys cache rounds 0/4/8/12 (matching do_aes's schedule).
	 */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (128 bytes) and > 0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV, converted back to wire byte order */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
.Lexit\key_len:
	/* NOTE(review): newer kernels use the RET macro (SLS mitigation)
	 * instead of a bare ret -- confirm the tree's convention */
	ret
.endm
518
/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * SysV AMD64: in=%rdi, iv=%rsi, keys=%rdx, out=%rcx, num_bytes=%r8
 * (see the #defines at the top of the file).  Also clobbers %r10 and
 * flags.  Only whole 16-byte blocks are processed; the counter at (p_iv)
 * is updated in place.  CTR encrypt and decrypt are the same operation.
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)
531
/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * SysV AMD64: in=%rdi, iv=%rsi, keys=%rdx, out=%rcx, num_bytes=%r8.
 * Also clobbers %r10 and flags.  Only whole 16-byte blocks are
 * processed; the counter at (p_iv) is updated in place.
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)
544
/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * SysV AMD64: in=%rdi, iv=%rsi, keys=%rdx, out=%rcx, num_bytes=%r8.
 * Also clobbers %r10 and flags.  Only whole 16-byte blocks are
 * processed; the counter at (p_iv) is updated in place.
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)
557