/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
/*
 * AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 */
/*
 * This is AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * https://github.com/intel/intel-ipsec-mb
 */

#include <linux/linkage.h>

#define VMOVDQ		vmovdqu

/* Up to eight counter blocks are processed in parallel in xmm0..xmm7. */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
/* Running counter, kept in byte-swapped form so vpaddq can increment it. */
#define xcounter	%xmm8
#define xbyteswap	%xmm9
/* Round keys that stay resident across main-loop iterations. */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
/* Scratch round keys, reloaded on every pass; also reused for input data. */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* Argument registers, in the order of the prototypes documented below. */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8
/* 32-bit view of num_bytes, used only to zero-extend it on entry. */
#define num_bytes32	%r8d

#define tmp		%r10
#define	DDQ_DATA	0
#define	XDATA		1
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
.align 16

/* vpshufb mask that reverses the 16 bytes of a block. */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* Selects the low quadword; used with vptest to detect a wrapped counter. */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* Adds one to the high quadword (carry out of the low quadword). */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/*
 * ddq_add_1 .. ddq_add_8 must stay contiguous and in order: do_aes indexes
 * them as (ddq_add_1 + 16 * (n - 1)).
 */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text

/* generate a unique variable for xmm register */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/*
 * club the numeric 'id' to the symbol 'name'
 *
 * With name == XDATA this binds var_xdata to %xmm<id>; .altmacro is needed
 * so that %\id is evaluated to a number before it is pasted.
 */

.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm

/*
 * do_aes num_in_par load_keys key_len
 * This increments p_in, but not p_out
 *
 * Encrypts 'num_in_par' (1..8) consecutive counter blocks, XORs the result
 * with the input at p_in and stores it at p_out. xcounter is advanced by
 * 'num_in_par'.
 *
 * When load_keys is non-zero the resident round keys xkey0/xkey4/xkey8/xkey12
 * are (re)loaded from p_keys as a side effect; when it is zero they must
 * already hold the values loaded by a previous do_aes_load or by
 * .Lmult_of_8_blks. For KEY_128 those registers cache rounds 0/3/6/9, for
 * the other key sizes rounds 0/4/8/12.
 *
 * Round keys are read with vmovdqa, so p_keys must be 16-byte aligned.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 uses the current counter value */
	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * blocks 1..by-1 use counter+i. vpaddq only adds within 64-bit
	 * lanes, so a low quadword that wrapped to zero is detected with
	 * vptest and the carry is added to the high quadword by hand (both
	 * in this block and in xcounter, so later blocks inherit it).
	 */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	/* advance the running counter past the blocks consumed here */
	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	/* round 0 (whitening) for the remaining blocks */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/* p_in now points past this batch; input is read back below */
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 (final round for KEY_128) */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				/* key 12 is the final round for KEY_192 */
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the keystream with the input, two blocks at a time. p_in was
	 * already advanced, hence the negative displacement. xkeyA/xkeyB are
	 * free at this point and serve as data temporaries.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd block count: handle the last block on its own */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm

/* do_aes that also (re)loads the resident round keys */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm

/* do_aes that relies on the resident round keys already being loaded */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm

/*
 * main body of aes ctr load
 *
 * Processes floor(num_bytes / 16) blocks: first the (blocks mod 8) leading
 * blocks in one batch, then the rest eight blocks at a time. Any trailing
 * partial block (num_bytes mod 16) is not processed. The updated counter is
 * written back to *p_iv, except when fewer than 16 bytes were requested, in
 * which case *p_iv is left untouched.
 */

.macro do_aes_ctrmain key_len
	/*
	 * num_bytes is documented as 'unsigned int'; the calling convention
	 * does not guarantee that bits 63:32 of the register are clear, so
	 * zero-extend before using it as a 64-bit quantity.
	 */
	mov	num_bytes32, num_bytes32

	/*
	 * Nothing to do for less than one block. xcounter/xbyteswap are not
	 * loaded yet, so this must bypass the IV write-back.
	 */
	cmp	$16, num_bytes
	jb	.Lno_blocks\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* tmp holds 1..7 whole blocks, expressed in bytes (16..112) */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	/* keep only whole groups of 8 blocks (clears the low 7 bits) */
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len


.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/*
	 * No leading batch ran, so preload the resident round keys that
	 * do_aes_noload expects (same register/round pairing as do_aes).
	 */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (128 bytes) and >0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
.Lno_blocks\key_len:
	RET
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Only whole 16-byte blocks are processed; *iv is updated to the next
 * counter value when at least one block was processed.
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Only whole 16-byte blocks are processed; *iv is updated to the next
 * counter value when at least one block was processed.
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Only whole 16-byte blocks are processed; *iv is updated to the next
 * counter value when at least one block was processed.
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)