/*
 *	Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is an optimized implementation of AES128/192/256 CTR mode. It
 * requires Intel(R) AES-NI and AVX instruction support.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in the Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/linkage.h>

#define VMOVDQ		vmovdqu		/* p_in/p_out need not be 16-byte aligned */

#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8
#define xbyteswap	%xmm9
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15

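/*
 * Function arguments, in x86_64 SysV calling-convention order, for
 * aes_ctr_enc_{128,192,256}_avx_by8(in, iv, keys, out, num_bytes):
 */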
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

#define tmp		%r10
#define	DDQ_DATA	0
#define	XDATA		1
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
.align 16

byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
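
/*
 * ddq_add_1 through ddq_add_8 are laid out contiguously so that
 * "ddq_add_1 + 16 * (i - 1)" in do_aes below indexes the increment
 * constant for counter block i.
 */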

.text

/* generate a unique variable for xmm register */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/* combine the numeric 'id' with the symbol 'name' to pick a register */

.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
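
/*
 * For example (illustrative): "club XDATA, 3" expands to "setxdata %3",
 * leaving var_xdata aliased to %xmm3 for the instructions that follow.
 */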

/*
 * do_aes num_in_par load_keys key_len
 *	num_in_par: number of blocks handled in parallel (1..8)
 *	load_keys:  non-zero to (re)load the cached round keys
 *		    (xkey0/xkey4/xkey8/xkey12) from p_keys
 *	key_len:    KEY_128, KEY_192 or KEY_256
 * This increments p_in, but not p_out.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	vpshufb	xbyteswap, xcounter, xdata0

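	/*
	 * Build the remaining (by - 1) counter blocks.  xcounter is kept
	 * byte-swapped (little-endian) so it can be incremented with
	 * vpaddq; each block is swapped back to big-endian before it is
	 * encrypted.  vptest sets ZF when the low 64 bits have wrapped
	 * to zero, in which case the carry is propagated into the high
	 * 64 bits.
	 */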
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i + 1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i + 1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i + 1)
	.endr

	add	$(16*by), p_in		/* advance p_in now; the tail XOR uses negative offsets */

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i + 1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i + 1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast	xkey12, var_xdata, var_xdata
			.endif
			.set i, (i + 1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i + 1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i + 1)
			.endr
		.endif
	.endif

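	/*
	 * XOR the keystream blocks with the input to produce the output.
	 * p_in was already advanced past this chunk (see the add above),
	 * hence the negative offsets.  p_out is advanced by the caller
	 * of this macro.
	 */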
	.set i, 0
	.rept (by / 2)
		.set j, (i + 1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i + 2)
	.endr

	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i + 1)
	.endr
.endm

.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm

.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm

/* top level of the AES CTR routine: handle the 1..7 leftover blocks, then loop by 8 */

.macro do_aes_ctrmain key_len
	cmp	$16, num_bytes
	jb	.Ldo_return1\key_len	/* < 1 block: return with the IV untouched */

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	mov	num_bytes, tmp
	and	$(7*16), tmp		/* tmp = leftover whole blocks, in bytes */
	jz	.Lmult_of_8_blks\key_len

	/* 1 <= tmp/16 <= 7: one to seven blocks remain */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes	/* keep only whole groups of 8 blocks */
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
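	/*
	 * Four of the round keys are cached in xkey0/4/8/12 so the main
	 * loop can use do_aes_noload.  The slots differ by key length
	 * because do_aes dedicates these registers to different rounds
	 * for AES-128 (keys 0, 3, 6, 9).
	 */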
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
.Ldo_return1\key_len:
	RET
.endm

/*
 * Routine to do AES128 CTR enc/decrypt "by8".
 * XMM registers are clobbered; saving/restoring must be done at a
 * higher level.
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
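/*
 * Illustrative call from C (hypothetical variable names; the real
 * caller is the kernel's AES-NI glue code, using the prototype above
 * and handling any partial final block itself):
 *
 *	u8 iv[16];			// big-endian counter block
 *	u8 keys[11 * 16];		// expanded AES-128 key schedule
 *	aes_ctr_enc_128_avx_by8(src, iv, keys, dst, nbytes & ~15);
 */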
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)

/*
 * Routine to do AES192 CTR enc/decrypt "by8".
 * XMM registers are clobbered; saving/restoring must be done at a
 * higher level.
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)

556
557/*
558 * routine to do AES256 CTR enc/decrypt "by8"
559 * XMM registers are clobbered.
560 * Saving/restoring must be done at a higher level
561 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
562 *			unsigned int num_bytes)
563 */
564SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
565	/* call the aes main loop */
566	do_aes_ctrmain KEY_256
567
568SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
569