/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
/*
 * AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 */
/*
 * This is AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 *    https://github.com/intel/intel-ipsec-mb
 */

#include <linux/linkage.h>

#define VMOVDQ		vmovdqu

/*
 * Note: the "x" prefix in these aliases means "this is an xmm register".  The
 * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
 * counter".
 *
 * xdata0..xdata7 hold the (up to eight) blocks processed in parallel.
 * %xmm8 and %xmm9 are shared between the two modes, which never run in the
 * same function.
 * xkey0/xkey4/xkey8/xkey12 cache four round keys across main-loop iterations.
 * Despite the names, in the KEY_128 case they hold round keys 0, 3, 6 and 9;
 * for the other key sizes they hold round keys 0, 4, 8 and 12.
 * xkeyA/xkeyB are scratch registers for the remaining round keys and, at the
 * end of do_aes, for the input blocks.
 */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8	// CTR mode only
#define xiv		%xmm8	// XCTR mode only
#define xbyteswap	%xmm9	// CTR mode only
#define xtmp		%xmm9	// XCTR mode only
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* Function arguments, in the order of the prototypes documented below. */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8
#define counter		%r9	// XCTR mode only
#define tmp		%r10
/* Selector values for the 'club' macro; only XDATA is acted upon there. */
#define	DDQ_DATA	0
#define	XDATA		1
/* Values accepted for the key_len macro parameter. */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
.align 16

/* vpshufb control that reverses the order of the 16 bytes of a register. */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* All ones in the low qword: used with vptest to detect a zero low qword. */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* 1 in the high qword: propagates a carry out of the low qword. */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/*
 * ddq_add_1 .. ddq_add_8 must stay contiguous and in this order: do_aes
 * addresses them as (ddq_add_1 + 16 * n).
 */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text

/* bind the assembler symbol var_xdata to xmm register number 'n' */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/*
 * club the numeric 'id' to the symbol 'name'
 *
 * In altmacro mode the '%' prefix makes the assembler evaluate 'id' (here the
 * loop variable) and pass its numeric value, so that "club XDATA, i" makes
 * var_xdata refer to the xmm register whose number is the value of i.
 */

.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm

/*
 * do_aes num_in_par load_keys key_len xctr
 *
 *   b       - number of 16-byte blocks processed in parallel; the callers in
 *             this file pass 1 to 8
 *   k       - non-zero: also load the cached round keys xkey0, xkey4, xkey8
 *             and xkey12 from p_keys; zero: assume they are already loaded
 *   key_len - KEY_128, KEY_192 or KEY_256
 *   xctr    - zero: CTR mode (big-endian 128-bit counter kept byte-swapped in
 *             xcounter); non-zero: XCTR mode (block counter kept in the
 *             general purpose register 'counter', XORed with xiv)
 *
 * Builds 'b' counter blocks in xdata0.., encrypts them, XORs them with 'b'
 * input blocks read from p_in and stores the result at p_out.
 *
 * This increments p_in, but not p_out
 *
 * The loads of the non-cached round keys into xkeyA/xkeyB are interleaved
 * with the AES rounds; the order of the statements below matters.
 */
.macro do_aes b, k, key_len, xctr
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	.if \xctr
		/* block i uses (counter + i + 1) in its low qword ... */
		movq counter, xtmp
		.set i, 0
		.rept (by)
			club XDATA, i
			vpaddq	(ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
			.set i, (i + 1)
		.endr
		/* ... XORed with the IV */
		.set i, 0
		.rept (by)
			club	XDATA, i
			vpxor	xiv, var_xdata, var_xdata
			.set i, (i + 1)
		.endr
	.else
		/* block 0 uses the current counter, swapped back to big endian */
		vpshufb	xbyteswap, xcounter, xdata0
		.set i, 1
		.rept (by - 1)
			club XDATA, i
			/* block i uses counter + i */
			vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
			/*
			 * vpaddq does not carry between qwords.  A zero low
			 * qword means the addition wrapped: carry into the
			 * high qword of this block and of xcounter, so that
			 * the following blocks see the carry as well.
			 */
			vptest	ddq_low_msk(%rip), var_xdata
			jnz 1f
			vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
			vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
			1:
			vpshufb	xbyteswap, var_xdata, var_xdata
			.set i, (i + 1)
		.endr
	.endif

	vmovdqa	1*16(p_keys), xkeyA		/* round key 1 */

	/* round 0 (whitening) for block 0 */
	vpxor	xkey0, xdata0, xdata0
	/* advance the counter past the 'by' blocks being processed */
	.if \xctr
		add $by, counter
	.else
		vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
		vptest	ddq_low_msk(%rip), xcounter
		jnz	1f
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
	.endif

	/* round 0 (whitening) for the remaining blocks */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i + 1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB		/* round key 2 */

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i + 1)
	.endr

	/* round key 3: cached in xkey4 for KEY_128, scratch otherwise */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i + 1)
	.endr

	/* input is read further down with negative offsets from p_in */
	add	$(16*by), p_in

	/* round key 4: scratch for KEY_128, cached in xkey4 otherwise */
	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA		/* round key 5 */

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	/* round key 6: cached in xkey8 for KEY_128, scratch otherwise */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i + 1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA		/* round key 7 */

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	/* round key 8: scratch for KEY_128, cached in xkey8 otherwise */
	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i + 1)
	.endr

	/* round key 9: cached in xkey12 for KEY_128, scratch otherwise */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB		/* round key 10 */

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA	/* round key 11 */
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10: the final round for KEY_128 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i + 1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA	/* round key 13 */
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			/* key 12: the final round for KEY_192 */
			.if (klen == KEY_256)
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i + 1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB	/* round key 14 */

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i + 1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14: the final round for KEY_256 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i + 1)
			.endr
		.endif
	.endif

	/*
	 * XOR the encrypted counter blocks with the input, two blocks at a
	 * time.  xkeyA/xkeyB are free to be used as scratch registers here.
	 * p_in has already been advanced, hence the "- 16*by" offsets.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i + 1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i + 2)
	.endr

	/* odd 'by': one block is left over */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* store the result */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i + 1)
	.endr
.endm

/* do_aes for 'val' blocks, (re)loading the cached round keys */
.macro do_aes_load val, key_len, xctr
	do_aes \val, 1, \key_len, \xctr
.endm

/* do_aes for 'val' blocks, relying on round keys cached by an earlier call */
.macro do_aes_noload val, key_len, xctr
	do_aes \val, 0, \key_len, \xctr
.endm

/*
 * main body of aes ctr load
 *
 *   key_len - KEY_128, KEY_192 or KEY_256
 *   xctr    - zero: CTR mode, non-zero: XCTR mode
 *
 * Expands to a complete function body, including the RET.  Both parameters
 * are pasted into the local label names so that every expansion gets its own
 * set of labels.
 *
 * Only whole 16-byte blocks are processed: num_bytes is rounded down to a
 * multiple of 16.  The (num_bytes / 16) % 8 leading blocks are handled by one
 * do_aes_load of the matching width, the rest by the 8-block main loop.
 *
 * In CTR mode the advanced counter is written back to (p_iv) when at least
 * one block was processed.  If num_bytes is below 16 nothing is processed and
 * (p_iv) is left untouched.  In XCTR mode (p_iv) is never written.
 *
 * NOTE(review): the prototypes below document num_bytes and byte_ctr as
 * unsigned int, while the full 64-bit %r8 and %r9 are used here; this assumes
 * the callers pass zero-extended values - TODO confirm.
 */

.macro do_aes_ctrmain key_len, xctr
	cmp	$16, num_bytes
	/*
	 * Nothing to do.  xcounter and xbyteswap are not loaded yet, so the
	 * IV write-back at .Ldo_return2 must be skipped on this path.
	 */
	jb	.Ldo_return_noiv\xctr\key_len

	/*
	 * Drop a trailing partial block up front.  The main loop counts
	 * num_bytes down in steps of 8*16 and stops only on an exact zero.
	 */
	and	$(~15), num_bytes

	.if \xctr
		/* byte counter -> block counter */
		shr	$4, counter
		vmovdqu	(p_iv), xiv
	.else
		/* keep the counter byte-swapped so that vpaddq can be used */
		vmovdqa	byteswap_const(%rip), xbyteswap
		vmovdqu	(p_iv), xcounter
		vpshufb	xbyteswap, xcounter, xcounter
	.endif

	/* tmp = 16 * (number of blocks modulo 8) */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\xctr\key_len

	/* 1 <= tmp / 16 <= 7 */
	cmp	$(4*16), tmp
	jg	.Lgt4\xctr\key_len
	je	.Leq4\xctr\key_len

.Llt4\xctr\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\xctr\key_len
	je	.Leq2\xctr\key_len

	/*
	 * In each .LeqN case below, "~7*16" is evaluated as (~7)*16 = -128:
	 * the 'and' rounds num_bytes down to a multiple of 8 blocks.
	 */
.Leq1\xctr\key_len:
	do_aes_load	1, \key_len, \xctr
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq2\xctr\key_len:
	do_aes_load	2, \key_len, \xctr
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len


.Leq3\xctr\key_len:
	do_aes_load	3, \key_len, \xctr
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq4\xctr\key_len:
	do_aes_load	4, \key_len, \xctr
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lgt4\xctr\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\xctr\key_len
	je	.Leq6\xctr\key_len

.Leq5\xctr\key_len:
	do_aes_load	5, \key_len, \xctr
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq6\xctr\key_len:
	do_aes_load	6, \key_len, \xctr
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq7\xctr\key_len:
	do_aes_load	7, \key_len, \xctr
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lmult_of_8_blks\xctr\key_len:
	/*
	 * No do_aes_load ran on this path, so load the cached round keys
	 * here; the main loop uses do_aes_noload.
	 */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\xctr\key_len:
	/* num_bytes is a non-zero multiple of 8 blocks (8*16 bytes) */
	do_aes_noload	8, \key_len, \xctr
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\xctr\key_len

.Ldo_return2\xctr\key_len:
	.if !\xctr
		/* return updated IV */
		vpshufb	xbyteswap, xcounter, xcounter
		vmovdqu	xcounter, (p_iv)
	.endif
.Ldo_return_noiv\xctr\key_len:
	RET
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128 0

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192 0

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256 0

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)

/*
 * routine to do AES128 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *	u8 *out, unsigned int num_bytes, unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128 1

SYM_FUNC_END(aes_xctr_enc_128_avx_by8)

/*
 * routine to do AES192 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *	u8 *out, unsigned int num_bytes, unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192 1

SYM_FUNC_END(aes_xctr_enc_192_avx_by8)

/*
 * routine to do AES256 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *	u8 *out, unsigned int num_bytes, unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256 1

SYM_FUNC_END(aes_xctr_enc_256_avx_by8)