/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
/*
 * AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 */
/*
 * This is an optimized implementation of AES128/192/256 CTR mode. It
 * requires support for the Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in the Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 *    https://github.com/intel/intel-ipsec-mb
 */

#include <linux/linkage.h>

/* Unaligned 128-bit move, used for the caller-supplied in/out buffers. */
#define VMOVDQ		vmovdqu

/*
 * Data registers for the up-to-eight counter blocks processed in parallel.
 * They must stay mapped to %xmm0..%xmm7 in this order: the setxdata macro
 * builds the register name as %xmm<n> directly from the block index.
 */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
/* Running counter, held byte-reversed relative to the IV in memory. */
#define xcounter	%xmm8
/* vpshufb mask loaded from byteswap_const. */
#define xbyteswap	%xmm9
/*
 * Round keys that stay resident across main-loop iterations.  xkey0 holds
 * round key 0.  For 192/256-bit keys xkey4/xkey8/xkey12 hold round keys
 * 4, 8 and 12; for 128-bit keys they hold round keys 3, 6 and 9 instead.
 */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
/* Scratch: the remaining round keys, then the input blocks for the XOR. */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* Function arguments (in, iv, keys, out, num_bytes). */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

/* General-purpose scratch register. */
#define tmp		%r10
/* Selector values for the 'name' argument of the club macro. */
#define	DDQ_DATA	0
#define	XDATA		1
/* Key-length selectors passed to do_aes and do_aes_ctrmain. */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
.align 16

/*
 * vpshufb mask that reverses the 16 bytes of a register.  It converts the
 * counter between its in-memory byte order and the order in which the
 * 64-bit additions below operate.
 */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* Selects the low 64 bits; vptest against it checks whether they are zero. */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* Adds 1 to the high 64 bits, i.e. the carry out of the low 64 bits. */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/*
 * Counter increments 1..8.  These entries must remain contiguous and in
 * ascending order: do_aes addresses them as ddq_add_1 + 16 * (n - 1).
 */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text

/*
 * setxdata n
 * Bind the assembler symbol var_xdata to data register %xmm<n>, so that
 * the instructions that follow can address block n generically.
 */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/*
 * club name, id
 * Combine the selector 'name' with the numeric value of 'id'.  Only the
 * XDATA selector has an effect: it binds var_xdata to %xmm<id>.
 *
 * 'id' is normally an assembler symbol such as the loop index.  Alternate
 * macro mode is switched on around the call so that the '%' prefix makes
 * the assembler pass the evaluated number, not the symbol name, on to
 * setxdata.
 */
.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm

/*
 * do_aes b, k, key_len
 *
 * Encrypt 'b' consecutive counter blocks in parallel and XOR the result
 * with b*16 bytes read from p_in, storing the output at p_out.
 *
 *   b        number of blocks handled by this expansion (data registers
 *            %xmm0..%xmm<b-1> are used, so at most 8)
 *   k        non-zero: also load the resident round keys xkey0, xkey4,
 *            xkey8 and xkey12 from p_keys; zero: they are already loaded
 *   key_len  KEY_128, KEY_192 or KEY_256
 *
 * Round keys are read from p_keys with aligned loads (vmovdqa); the data
 * buffers are accessed with unaligned moves.
 *
 * On exit xcounter has been advanced by 'b' and p_in by 16*b.  p_out is
 * NOT advanced; the caller does that.  Clobbers the data registers in
 * use, xkeyA, xkeyB and the flags.
 *
 * Counter arithmetic: vpaddq adds the two 64-bit halves independently, so
 * a carry out of the low half has to be applied by hand.  The per-block
 * increments rise by one each step, so a wrap of the low half shows up as
 * an all-zero low half for exactly one block.  When that happens the carry
 * is added to that block and to xcounter itself, so that all later blocks
 * inherit it.
 *
 * Loads of the next round key are interleaved with the AES rounds of the
 * previous one; keep the statement order when editing.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 is the current counter, converted back to IV byte order */
	vpshufb	xbyteswap, xcounter, xdata0

	/* blocks 1..by-1: counter + i, with manual carry into the high half */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	/* advance the running counter past the blocks generated above */
	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	/* round key 0 for the remaining blocks */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	/* round key 3: resident (xkey4) for 128-bit keys, scratch otherwise */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/* step the input pointer now; the XOR below uses negative offsets */
	add	$(16*by), p_in

	/* round key 4: scratch for 128-bit keys, resident (xkey4) otherwise */
	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	/* round key 6: resident (xkey8) for 128-bit keys, scratch otherwise */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	/* round key 8: scratch for 128-bit keys, resident (xkey8) otherwise */
	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	/* round key 9: resident (xkey12) for 128-bit keys, scratch otherwise */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 */
		.if (klen == KEY_128)
			/* final round for 128-bit keys */
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				/* key 12 is the final round for 192-bit keys */
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the keystream with the input, two blocks per step.  p_in was
	 * already advanced, hence the (x*16 - 16*by) offsets.  xkeyA and
	 * xkeyB are free again and serve as temporaries for the input.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd block count: handle the last block on its own */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* write the results out */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm

/*
 * do_aes_load val, key_len
 * Process 'val' blocks with do_aes and load the resident round keys
 * (xkey0, xkey4, xkey8, xkey12) along the way.
 */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm

/*
 * do_aes_noload val, key_len
 * Process 'val' blocks with do_aes, relying on the resident round keys
 * (xkey0, xkey4, xkey8, xkey12) having been loaded beforehand.
 */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm

/*
 * do_aes_ctrmain key_len
 *
 * Function body shared by the three entry points.  key_len is KEY_128,
 * KEY_192 or KEY_256; it is also appended to every local label so that
 * each expansion gets its own set of labels.
 *
 * Flow:
 *   1. Load the IV into xcounter in byte-reversed form.
 *   2. Handle the (num_bytes / 16) % 8 leading blocks with a single
 *      do_aes_load expansion, which also loads the resident round keys.
 *   3. Handle the rest eight blocks (128 bytes) at a time.
 *   4. Store the advanced counter back to the IV and return.
 *
 * Only whole 16-byte blocks are processed; a trailing partial block is
 * ignored.  If num_bytes is below 16 the macro returns without touching
 * any buffer, including the IV.
 */
.macro do_aes_ctrmain key_len
	/*
	 * num_bytes is documented as an unsigned int argument, so only the
	 * low 32 bits of its register (%r8d) are defined on entry.
	 * Zero-extend before the 64-bit arithmetic below.
	 */
	mov	%r8d, %r8d

	/*
	 * Less than one block: nothing to do.  Skip the IV write-back as
	 * well, since xcounter and xbyteswap have not been loaded yet.
	 */
	cmp	$16, num_bytes
	jb	.Ldo_return_noiv\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = 16 * (number of whole blocks modulo 8) */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1 <= tmp / 16 <= 7 */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

	/*
	 * Each .LeqN case processes N blocks, then rounds num_bytes down to
	 * a multiple of 128 and either returns or enters the main loop.
	 */
.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len


.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/*
	 * Drop a trailing partial block, as the .LeqN cases do.  Without
	 * this the sub/jne loop below would never reach zero.  The result
	 * is at least 128 here: num_bytes >= 16 and bits 4..6 are clear.
	 */
	and	$(~7*16), num_bytes

	/* no do_aes_load ran on this path, so load the resident keys here */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a non-zero multiple of 8 blocks (128 bytes) */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
.Ldo_return_noiv\key_len:
	RET
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 *
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 *   in         input data
 *   iv         16-byte counter block; the advanced counter is written
 *              back here
 *   keys       expanded round keys, read with aligned 16-byte loads
 *   out        output data
 *   num_bytes  amount of data to process
 *              NOTE(review): presumably always a multiple of 16 --
 *              confirm against the callers
 *
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* expand the shared CTR body for 128-bit keys */
	do_aes_ctrmain KEY_128

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 *
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 *   in         input data
 *   iv         16-byte counter block; the advanced counter is written
 *              back here
 *   keys       expanded round keys, read with aligned 16-byte loads
 *   out        output data
 *   num_bytes  amount of data to process
 *              NOTE(review): presumably always a multiple of 16 --
 *              confirm against the callers
 *
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* expand the shared CTR body for 192-bit keys */
	do_aes_ctrmain KEY_192

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 *
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 *   in         input data
 *   iv         16-byte counter block; the advanced counter is written
 *              back here
 *   keys       expanded round keys, read with aligned 16-byte loads
 *   out        output data
 *   num_bytes  amount of data to process
 *              NOTE(review): presumably always a multiple of 16 --
 *              confirm against the callers
 *
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* expand the shared CTR body for 256-bit keys */
	do_aes_ctrmain KEY_256

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
