/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions. This file contains accelerated part of ghash
 * implementation. More information about PCLMULQDQ can be found at:
 *
 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
 *
 * Copyright (c) 2009 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *	     Vinodh Gopal
 *	     Erdinc Ozturk
 *	     Deniz Karakoyunlu
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

/*
 * Syntax: GNU as, AT&T operand order (source, destination), run through
 * the C preprocessor. The exported functions take their arguments in
 * %rdi, %rsi, %rdx, %rcx in that order and only use XMM registers as
 * scratch; no stack frame is set up.
 */

#include <linux/linkage.h>
#include <asm/inst.h>

/* The shuffle mask is never written, so keep it in read-only data. */
.section .rodata

.align 16
/*
 * PSHUFB control mask: destination byte i takes source byte 15 - i,
 * i.e. the 16 bytes of the register are reversed.
 */
.Lbswap_mask:
	.octa 0x000102030405060708090a0b0c0d0e0f

/* Register roles shared by every routine in this file. */
#define DATA	%xmm0
#define SHASH	%xmm1
#define T1	%xmm2
#define T2	%xmm3
#define T3	%xmm4
#define BSWAP	%xmm5
#define IN1	%xmm6

.text

/*
 * __clmul_gf128mul_ble:	internal ABI
 * input:
 *	DATA:			operand1
 *	SHASH:			operand2, hash_key << 1 mod poly
 * output:
 *	DATA:			operand1 * operand2 mod poly
 * changed:
 *	T1
 *	T2
 *	T3
 *
 * The 128x128-bit carry-less product is built from three 64x64-bit
 * PCLMULQDQ products (a0*b0, a1*b1 and (a1+a0)*(b1+b0)) and then folded
 * back to 128 bits in two shift/xor phases.
 */
__clmul_gf128mul_ble:
	movaps DATA, T1
	pshufd $0b01001110, DATA, T2	# T2 = DATA with 64-bit halves swapped
	pshufd $0b01001110, SHASH, T3	# T3 = SHASH with 64-bit halves swapped
	pxor DATA, T2			# low qword of T2 = a1 + a0
	pxor SHASH, T3			# low qword of T3 = b1 + b0

	PCLMULQDQ 0x00 SHASH DATA	# DATA = a0 * b0
	PCLMULQDQ 0x11 SHASH T1		# T1 = a1 * b1
	PCLMULQDQ 0x00 T3 T2		# T2 = (a1 + a0) * (b1 + b0)
	pxor DATA, T2
	pxor T1, T2			# T2 = a0 * b1 + a1 * b0

	# add the middle term, split across the low and high halves
	movaps T2, T3
	pslldq $8, T3
	psrldq $8, T2
	pxor T3, DATA
	pxor T2, T1			# <T1:DATA> is result of
					# carry-less multiplication

	# first phase of the reduction
	# per 64-bit lane: T3 = (DATA << 63) ^ (DATA << 62) ^ (DATA << 57)
	movaps DATA, T3
	psllq $1, T3
	pxor DATA, T3
	psllq $5, T3
	pxor DATA, T3
	psllq $57, T3
	movaps T3, T2
	pslldq $8, T2
	psrldq $8, T3
	pxor T2, DATA
	pxor T3, T1

	# second phase of the reduction
	# per 64-bit lane: T2 = (DATA >> 1) ^ (DATA >> 2) ^ (DATA >> 7)
	movaps DATA, T2
	psrlq $5, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor T2, T1
	pxor T1, DATA			# fold the high half into the result
	ret
ENDPROC(__clmul_gf128mul_ble)

/*
 * void clmul_ghash_mul(char *dst, const u128 *shash)
 *
 * Multiplies the 16-byte block at dst by *shash in place.
 *
 * in:		%rdi = dst	(16 bytes, read and written, may be unaligned)
 *		%rsi = shash	(16 bytes, read only, may be unaligned)
 * changed:	DATA, SHASH, T1, T2, T3, BSWAP (%xmm0-%xmm5)
 */
ENTRY(clmul_ghash_mul)
	movups (%rdi), DATA
	movups (%rsi), SHASH
	movaps .Lbswap_mask(%rip), BSWAP
	PSHUFB_XMM BSWAP DATA		# byte-reverse dst for the multiply
	call __clmul_gf128mul_ble
	PSHUFB_XMM BSWAP DATA		# restore the original byte order
	movups DATA, (%rdi)
	ret
ENDPROC(clmul_ghash_mul)

/*
 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
 *			   const u128 *shash);
 *
 * For every complete 16-byte block of src: xor the block into the running
 * value loaded from dst, then multiply by *shash. The result is written
 * back to dst. If srclen < 16, dst is left untouched; a trailing partial
 * block (srclen % 16 bytes) is not consumed.
 *
 * in:		%rdi = dst	(16 bytes, read and written, may be unaligned)
 *		%rsi = src	(read only, may be unaligned)
 *		%edx = srclen	(bytes)
 *		%rcx = shash	(16 bytes, read only, may be unaligned)
 * changed:	%rsi, %edx, flags, DATA, SHASH, T1, T2, T3, BSWAP, IN1
 *		(%xmm0-%xmm6)
 *
 * srclen is a 32-bit unsigned int, so it is only ever accessed as %edx:
 * the upper half of %rdx is not guaranteed to be zero on entry. All
 * length comparisons are unsigned (jb/jae).
 */
ENTRY(clmul_ghash_update)
	cmp $16, %edx
	jb .Lupdate_just_ret		# less than one full block
	movaps .Lbswap_mask(%rip), BSWAP
	movups (%rdi), DATA
	movups (%rcx), SHASH
	PSHUFB_XMM BSWAP DATA
.align 4
.Lupdate_loop:
	# invariant: %edx >= 16 bytes remain at %rsi
	movups (%rsi), IN1
	PSHUFB_XMM BSWAP IN1
	pxor IN1, DATA
	call __clmul_gf128mul_ble
	sub $16, %edx
	add $16, %rsi
	cmp $16, %edx
	jae .Lupdate_loop		# another full block remains
	PSHUFB_XMM BSWAP DATA
	movups DATA, (%rdi)
.Lupdate_just_ret:
	ret
ENDPROC(clmul_ghash_update)