/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
/*
 * AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 */
/*
 * This is an optimized implementation of AES128/192/256 in CTR mode. It
 * requires the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 *    https://github.com/intel/intel-ipsec-mb
 */

#include <linux/linkage.h>

/* Unaligned 128-bit move; used for the input and output data buffers. */
#define VMOVDQ		vmovdqu

/*
 * Note: the "x" prefix in these aliases means "this is an xmm register".  The
 * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
 * counter".
 */

/* Up to eight blocks being encrypted in parallel. */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7

/* %xmm8 and %xmm9 are used differently by the two modes. */
#define xcounter	%xmm8	// CTR mode only
#define xiv		%xmm8	// XCTR mode only
#define xbyteswap	%xmm9	// CTR mode only
#define xtmp		%xmm9	// XCTR mode only

/*
 * Round keys kept in registers across calls of do_aes.  For 192/256-bit keys
 * these hold round keys 0, 4, 8 and 12; for 128-bit keys they hold round keys
 * 0, 3, 6 and 9.
 */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13

/* Scratch: round keys reloaded on every pass, later the input data. */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* Function arguments, in the order given in the prototypes further down. */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8
#define counter		%r9	// XCTR mode only

/* General-purpose scratch register. */
#define tmp		%r10

/* Selector values for the first argument of the club macro. */
#define	DDQ_DATA	0
#define	XDATA		1

/* Key-size selectors passed as key_len to the macros below. */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
.align 16

/* vpshufb control that reverses the byte order of a 128-bit value */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* all-ones in the low quadword; vptest against it checks "low 64 bits == 0" */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* adds 1 to the high quadword (carry out of the low quadword) */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/*
 * Increments 1..8.  The code addresses these as (ddq_add_1 + 16 * k), so the
 * eight entries must stay contiguous and in ascending order.
 */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text

/*
 * setxdata n
 * Bind the assembler symbol var_xdata to register %xmm<n>.  <n> is pasted
 * textually after "%xmm", so it has to be a literal number by the time this
 * macro is expanded.
 */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/*
 * club name, id
 * Club the numeric 'id' to the symbol 'name': when 'name' is XDATA, make
 * var_xdata refer to %xmm<id>.  Any other 'name' expands to nothing.
 *
 * 'id' is usually an assembler symbol (e.g. the loop counter i).  The '%'
 * prefix is the .altmacro operator that evaluates the expression and passes
 * its value as a string, which is what setxdata needs for pasting.
 */
.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm

/*
 * do_aes num_in_par load_keys key_len xctr
 *
 * Encrypt 'b' counter blocks in parallel, XOR them with 'b' blocks read from
 * p_in and store the result at p_out.
 *
 *   b        number of blocks handled by this expansion (xdata0..xdata<b-1>)
 *   k        non-zero: also (re)load the cached round keys xkey0, xkey4,
 *            xkey8 and xkey12 from p_keys; zero: they are already loaded
 *   key_len  KEY_128, KEY_192 or KEY_256
 *   xctr     0: CTR mode, blocks come from xcounter (kept byte-swapped so it
 *               can be incremented with vpaddq)
 *            1: XCTR mode, block i is xiv XOR (counter + 1 + i)
 *
 * This increments p_in, but not p_out.  The counter (xcounter in CTR mode,
 * counter in XCTR mode) is advanced by 'b'.  Round keys are read with
 * vmovdqa, so p_keys must be 16-byte aligned.
 */
.macro do_aes b, k, key_len, xctr
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	.if \xctr
		/* VEX-encoded like every other vector instruction in this file */
		vmovq	counter, xtmp
		.set i, 0
		.rept (by)
			club XDATA, i
			vpaddq	(ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
			.set i, (i +1)
		.endr
		.set i, 0
		.rept (by)
			club	XDATA, i
			vpxor	xiv, var_xdata, var_xdata
			.set i, (i +1)
		.endr
	.else
		vpshufb	xbyteswap, xcounter, xdata0
		.set i, 1
		.rept (by - 1)
			club XDATA, i
			vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
			/*
			 * Low quadword wrapped to zero: carry into the high
			 * quadword of this block and of xcounter, so that the
			 * following blocks inherit the carry.
			 */
			vptest	ddq_low_msk(%rip), var_xdata
			jnz 1f
			vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
			vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
			1:
			vpshufb	xbyteswap, var_xdata, var_xdata
			.set i, (i +1)
		.endr
	.endif

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	/* advance the counter past the blocks generated above */
	.if \xctr
		add $by, counter
	.else
		vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
		vptest	ddq_low_msk(%rip), xcounter
		jnz	1f
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
	.endif

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10: the last round for 128-bit keys */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			/* key 12: the last round for 192-bit keys */
			.if (klen == KEY_256)
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the keystream with the input, two blocks at a time.  p_in has
	 * already been advanced, hence the negative offsets.  xkeyA/xkeyB
	 * are free again and serve as temporaries for the input data.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd block count: one block is left over */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm

/*
 * do_aes_load val, key_len, xctr
 * do_aes on 'val' blocks, loading the cached round keys along the way.
 */
.macro do_aes_load val, key_len, xctr
	do_aes \val, 1, \key_len, \xctr
.endm

/*
 * do_aes_noload val, key_len, xctr
 * do_aes on 'val' blocks, relying on the cached round keys having been
 * loaded earlier.
 */
.macro do_aes_noload val, key_len, xctr
	do_aes \val, 0, \key_len, \xctr
.endm

/*
 * do_aes_ctrmain key_len, xctr
 *
 * Main body of aes ctr load, shared by all exported routines below.
 *
 *   key_len  KEY_128, KEY_192 or KEY_256
 *   xctr     0 for CTR mode, 1 for XCTR mode
 *
 * Only whole 16-byte blocks are processed; a trailing partial block in
 * num_bytes is ignored.  The first (blocks % 8) blocks are handled by a single
 * do_aes_load of that width, the remainder by the 8-block main loop.
 *
 * In CTR mode the advanced counter is written back to *p_iv, unless there was
 * less than one block to process.  In XCTR mode *p_iv is only read.
 */
.macro do_aes_ctrmain key_len, xctr
	/*
	 * Less than one block: nothing to do.  The counter registers are not
	 * loaded yet, so this must not go through the IV write-back.
	 */
	cmp	$16, num_bytes
	jb	.Lno_blocks\xctr\key_len

	.if \xctr
		/* byte counter -> block counter */
		shr	$4, counter
		vmovdqu	(p_iv), xiv
	.else
		vmovdqa	byteswap_const(%rip), xbyteswap
		vmovdqu	(p_iv), xcounter
		vpshufb	xbyteswap, xcounter, xcounter
	.endif

	/* tmp = 16 * (number of blocks modulo 8) */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\xctr\key_len

	/* 1 <= tmp / 16 <= 7 */
	cmp	$(4*16), tmp
	jg	.Lgt4\xctr\key_len
	je	.Leq4\xctr\key_len

.Llt4\xctr\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\xctr\key_len
	je	.Leq2\xctr\key_len

	/*
	 * Each .LeqN case handles N blocks, then rounds num_bytes down to a
	 * multiple of 8 blocks and either returns or enters the main loop.
	 */
.Leq1\xctr\key_len:
	do_aes_load	1, \key_len, \xctr
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq2\xctr\key_len:
	do_aes_load	2, \key_len, \xctr
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len


.Leq3\xctr\key_len:
	do_aes_load	3, \key_len, \xctr
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq4\xctr\key_len:
	do_aes_load	4, \key_len, \xctr
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lgt4\xctr\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\xctr\key_len
	je	.Leq6\xctr\key_len

.Leq5\xctr\key_len:
	do_aes_load	5, \key_len, \xctr
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq6\xctr\key_len:
	do_aes_load	6, \key_len, \xctr
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq7\xctr\key_len:
	do_aes_load	7, \key_len, \xctr
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lmult_of_8_blks\xctr\key_len:
	/*
	 * Drop a partial trailing block, as the .LeqN cases do; otherwise the
	 * sub/jne below would step past zero.  At least 8 blocks remain.
	 */
	and	$(~7*16), num_bytes
	/* no do_aes_load ran on this path, so load the cached keys here */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\xctr\key_len:
	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and >0 */
	do_aes_noload	8, \key_len, \xctr
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\xctr\key_len

.Ldo_return2\xctr\key_len:
	.if !\xctr
		/* return updated IV */
		vpshufb	xbyteswap, xcounter, xcounter
		vmovdqu	xcounter, (p_iv)
	.endif
.Lno_blocks\xctr\key_len:
	RET
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Only whole 16-byte blocks of num_bytes are processed.  keys is read with
 * aligned loads and must be 16-byte aligned.
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128 0

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Only whole 16-byte blocks of num_bytes are processed.  keys is read with
 * aligned loads and must be 16-byte aligned.
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192 0

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Only whole 16-byte blocks of num_bytes are processed.  keys is read with
 * aligned loads and must be 16-byte aligned.
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256 0

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)

/*
 * routine to do AES128 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *	u8 *out, unsigned int num_bytes, unsigned int byte_ctr)
 *
 * byte_ctr is divided by 16 to obtain the starting block counter.  iv is only
 * read.  Only whole 16-byte blocks of num_bytes are processed.  keys is read
 * with aligned loads and must be 16-byte aligned.
 */
SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128 1

SYM_FUNC_END(aes_xctr_enc_128_avx_by8)

/*
 * routine to do AES192 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *	u8 *out, unsigned int num_bytes, unsigned int byte_ctr)
 *
 * byte_ctr is divided by 16 to obtain the starting block counter.  iv is only
 * read.  Only whole 16-byte blocks of num_bytes are processed.  keys is read
 * with aligned loads and must be 16-byte aligned.
 */
SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192 1

SYM_FUNC_END(aes_xctr_enc_192_avx_by8)

/*
 * routine to do AES256 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *	u8 *out, unsigned int num_bytes, unsigned int byte_ctr)
 *
 * byte_ctr is divided by 16 to obtain the starting block counter.  iv is only
 * read.  Only whole 16-byte blocks of num_bytes are processed.  keys is read
 * with aligned loads and must be 16-byte aligned.
 */
SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256 1

SYM_FUNC_END(aes_xctr_enc_256_avx_by8)