/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions. This file contains accelerated part of ghash
 * implementation. More information about PCLMULQDQ can be found at:
 *
 * https://www.intel.com/content/dam/develop/external/us/en/documents/clmul-wp-rev-2-02-2014-04-20.pdf
 *
 * Copyright (c) 2009 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *	     Vinodh Gopal
 *	     Erdinc Ozturk
 *	     Deniz Karakoyunlu
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * 16-byte shuffle control for pshufb.  Stored little-endian, byte i of the
 * mask holds index 15 - i, so "pshufb BSWAP, reg" reverses the byte order of
 * the whole 128-bit register.
 */
.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.octa 0x000102030405060708090a0b0c0d0e0f

/* Register roles shared by every routine in this file. */
#define DATA	%xmm0
#define SHASH	%xmm1
#define T1	%xmm2
#define T2	%xmm3
#define T3	%xmm4
#define BSWAP	%xmm5
#define IN1	%xmm6

.text

/*
 * __clmul_gf128mul_ble:	internal ABI
 * input:
 *	DATA:			operand1
 *	SHASH:			operand2, hash_key << 1 mod poly
 * output:
 *	DATA:			operand1 * operand2 mod poly
 * changed:
 *	T1
 *	T2
 *	T3
 *
 * With a = DATA = <a1:a0> and b = SHASH = <b1:b0> (64-bit halves), the
 * 256-bit carry-less product is built from three 64x64 multiplications
 * (Karatsuba): a0 * b0, a1 * b1 and (a1 + a0) * (b1 + b0).  The product is
 * then folded back to 128 bits by the two reduction phases below.  SHASH is
 * only read; BSWAP and IN1 are not touched.
 */
SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
	movaps DATA, T1
	/* pshufd $0b01001110 swaps the two 64-bit halves of its source */
	pshufd $0b01001110, DATA, T2
	pshufd $0b01001110, SHASH, T3
	/* low qword of T2 = a1 + a0, low qword of T3 = b1 + b0 */
	pxor DATA, T2
	pxor SHASH, T3

	pclmulqdq $0x00, SHASH, DATA	# DATA = a0 * b0
	pclmulqdq $0x11, SHASH, T1	# T1 = a1 * b1
	pclmulqdq $0x00, T3, T2		# T2 = (a1 + a0) * (b1 + b0)
	pxor DATA, T2
	pxor T1, T2			# T2 = a0 * b1 + a1 * b0

	/* add the middle term, shifted left by 64 bits, into <T1:DATA> */
	movaps T2, T3
	pslldq $8, T3
	psrldq $8, T2
	pxor T3, DATA
	pxor T2, T1			# <T1:DATA> is result of
					# carry-less multiplication

	# first phase of the reduction
	movaps DATA, T3
	psllq $1, T3
	pxor DATA, T3
	psllq $5, T3
	pxor DATA, T3
	psllq $57, T3
	movaps T3, T2
	pslldq $8, T2
	psrldq $8, T3
	pxor T2, DATA
	pxor T3, T1

	# second phase of the reduction
	movaps DATA, T2
	psrlq $5, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor T2, T1
	pxor T1, DATA
	RET
SYM_FUNC_END(__clmul_gf128mul_ble)

/*
 * void clmul_ghash_mul(char *dst, const le128 *shash)
 *
 * Multiplies the 16-byte value at dst (%rdi) by the key at shash (%rsi) and
 * writes the 16-byte result back to dst.  Both pointers are accessed with
 * unaligned loads/stores (movups).  The value is byte-reversed before the
 * multiplication and again before it is stored; the key is used as loaded.
 *
 * Uses DATA, SHASH, T1, T2, T3 and BSWAP (%xmm0-%xmm5).
 */
SYM_FUNC_START(clmul_ghash_mul)
	FRAME_BEGIN
	movups (%rdi), DATA
	movups (%rsi), SHASH
	movaps .Lbswap_mask(%rip), BSWAP
	pshufb BSWAP, DATA
	call __clmul_gf128mul_ble
	pshufb BSWAP, DATA
	movups DATA, (%rdi)
	FRAME_END
	RET
SYM_FUNC_END(clmul_ghash_mul)

/*
 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
 *			   const le128 *shash);
 *
 * Folds every complete 16-byte block of src (%rsi) into the 16-byte
 * accumulator at dst (%rdi): for each block, the byte-reversed block is
 * XORed into the accumulator, which is then multiplied by the key at shash
 * (%rcx).  The accumulator is kept byte-reversed in DATA for the whole loop
 * and reversed back once before being stored.
 *
 * If srclen (%rdx) is below 16 the function returns without reading or
 * writing dst.  A trailing partial block (srclen % 16 bytes) is not
 * consumed.  %rsi and %rdx are advanced/decremented in place.
 *
 * Uses DATA, SHASH, T1, T2, T3, BSWAP and IN1 (%xmm0-%xmm6).
 *
 * NOTE(review): srclen is declared unsigned int but is compared through the
 * full 64-bit %rdx, so this assumes the caller passes it zero-extended --
 * confirm against the callers.
 */
SYM_FUNC_START(clmul_ghash_update)
	FRAME_BEGIN
	cmp $16, %rdx
	jb .Lupdate_just_ret	# check length
	movaps .Lbswap_mask(%rip), BSWAP
	movups (%rdi), DATA
	movups (%rcx), SHASH
	pshufb BSWAP, DATA
.align 4
.Lupdate_loop:
	movups (%rsi), IN1
	pshufb BSWAP, IN1
	pxor IN1, DATA
	call __clmul_gf128mul_ble
	sub $16, %rdx
	add $16, %rsi
	/* keep going while at least one more full block remains */
	cmp $16, %rdx
	jge .Lupdate_loop
	pshufb BSWAP, DATA
	movups DATA, (%rdi)
.Lupdate_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(clmul_ghash_update)