/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions. This file contains accelerated part of ghash
 * implementation. More information about PCLMULQDQ can be found at:
 *
 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
 *
 * Copyright (c) 2009 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *	     Vinodh Gopal
 *	     Erdinc Ozturk
 *	     Deniz Karakoyunlu
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

/*
 * Syntax: GNU as, AT&T operand order (op src, dst), run through cpp.
 * Target: x86-64; the exported entry points take their arguments in
 *	   %rdi, %rsi, %rdx, %rcx.
 * Only XMM registers, %rsi and %edx are modified; no stack memory is
 * used apart from the return address pushed by the internal call.
 * NOTE(review): callers are presumably responsible for making the SSE
 * register state usable in their context -- verify against the C glue.
 */

#include <linux/linkage.h>
#include <asm/inst.h>

/*
 * Constant pool. These values are only ever read, so they are placed in
 * .rodata. The 16-byte alignment is required because they are used as
 * memory operands of movaps/pcmpeqd/pand; each entry is 16 bytes, so
 * aligning the first one aligns all of them.
 */
.section .rodata

.align 16
.Lbswap_mask:
	/* pshufb control that reverses the order of the 16 bytes */
	.octa 0x000102030405060708090a0b0c0d0e0f
.Lpoly:
	/* value xored into the shifted key by clmul_ghash_setkey */
	.octa 0xc2000000000000000000000000000001
.Ltwo_one:
	/* dwords {1, 0, 0, 1}: compare pattern used by clmul_ghash_setkey */
	.octa 0x00000001000000000000000000000001

/* Register roles shared by all routines in this file */
#define DATA	%xmm0
#define SHASH	%xmm1
#define T1	%xmm2
#define T2	%xmm3
#define T3	%xmm4
#define BSWAP	%xmm5
#define IN1	%xmm6

.text

/*
 * __clmul_gf128mul_ble:	internal ABI
 * input:
 *	DATA:			operand1
 *	SHASH:			operand2, hash_key << 1 mod poly
 * output:
 *	DATA:			operand1 * operand2 mod poly
 * changed:
 *	T1
 *	T2
 *	T3
 *
 * SHASH, BSWAP and IN1 are left untouched, so callers can keep them
 * live across the call.
 */
__clmul_gf128mul_ble:
	movaps DATA, T1
	pshufd $0b01001110, DATA, T2	/* T2 = DATA with qwords swapped */
	pshufd $0b01001110, SHASH, T3	/* T3 = SHASH with qwords swapped */
	pxor DATA, T2			/* low qword of T2 = a1 + a0 */
	pxor SHASH, T3			/* low qword of T3 = b1 + b0 */

	/* three carry-less multiplies instead of four (Karatsuba) */
	PCLMULQDQ 0x00 SHASH DATA	/* DATA = a0 * b0 */
	PCLMULQDQ 0x11 SHASH T1		/* T1 = a1 * b1 */
	PCLMULQDQ 0x00 T3 T2		/* T2 = (a1 + a0) * (b1 + b0) */
	pxor DATA, T2
	pxor T1, T2			/* T2 = a0 * b1 + a1 * b0 */

	/* fold the middle term into the 256-bit product */
	movaps T2, T3
	pslldq $8, T3			/* low half of T2 -> high qword */
	psrldq $8, T2			/* high half of T2 -> low qword */
	pxor T3, DATA
	pxor T2, T1			/* <T1:DATA> is result of */
					/* carry-less multiplication */

	/*
	 * first phase of the reduction
	 * per 64-bit lane: T3 = (DATA << 63) ^ (DATA << 62) ^ (DATA << 57)
	 */
	movaps DATA, T3
	psllq $1, T3
	pxor DATA, T3
	psllq $5, T3
	pxor DATA, T3
	psllq $57, T3
	movaps T3, T2
	pslldq $8, T2			/* low qword of T3 -> high qword */
	psrldq $8, T3			/* high qword of T3 -> low qword */
	pxor T2, DATA
	pxor T3, T1

	/*
	 * second phase of the reduction
	 * per 64-bit lane: T2 = (DATA >> 1) ^ (DATA >> 2) ^ (DATA >> 7)
	 */
	movaps DATA, T2
	psrlq $5, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor T2, T1
	pxor T1, DATA			/* DATA = DATA ^ T1 ^ T2 */
	ret

/*
 * void clmul_ghash_mul(char *dst, const be128 *shash)
 *
 * In:	  %rdi = dst, 16 bytes, read and written (no alignment required)
 *	  %rsi = shash, 16 bytes (no alignment required)
 * Out:	  *dst = bswap(bswap(*dst) * *shash mod poly)
 * Clobb: DATA, SHASH, T1, T2, T3, BSWAP (%xmm0-%xmm5)
 */
ENTRY(clmul_ghash_mul)
	movups (%rdi), DATA
	movups (%rsi), SHASH
	movaps .Lbswap_mask(%rip), BSWAP
	PSHUFB_XMM BSWAP DATA		/* byte-reverse before multiplying */
	call __clmul_gf128mul_ble
	PSHUFB_XMM BSWAP DATA		/* restore the byte order of dst */
	movups DATA, (%rdi)
	ret

/*
 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
 *			   const be128 *shash);
 *
 * For every complete 16-byte block of src:
 *	dst = (dst ^ block) * shash mod poly
 * Trailing bytes beyond the last complete block are not consumed, and
 * dst is not touched at all when srclen < 16.
 *
 * In:	  %rdi = dst, 16 bytes, read and written
 *	  %rsi = src
 *	  %edx = srclen in bytes
 *	  %rcx = shash, 16 bytes
 * Clobb: %rsi, %edx, flags, DATA, SHASH, T1, T2, T3, BSWAP, IN1
 *	  (%xmm0-%xmm6)
 *
 * srclen is a 32-bit unsigned value, so only %edx is examined: the
 * upper half of %rdx is not guaranteed to be zero for a 32-bit
 * argument, and the comparisons are unsigned on both loop edges.
 */
ENTRY(clmul_ghash_update)
	cmp $16, %edx
	jb .Lupdate_just_ret		/* less than one block: nothing to do */
	movaps .Lbswap_mask(%rip), BSWAP
	movups (%rdi), DATA
	movups (%rcx), SHASH
	PSHUFB_XMM BSWAP DATA
.align 4
.Lupdate_loop:
	/* invariant: %edx >= 16, %rsi points at the next unread block */
	movups (%rsi), IN1
	PSHUFB_XMM BSWAP IN1
	pxor IN1, DATA
	call __clmul_gf128mul_ble
	sub $16, %edx
	add $16, %rsi
	cmp $16, %edx
	jae .Lupdate_loop		/* another complete block remains */
	PSHUFB_XMM BSWAP DATA
	movups DATA, (%rdi)
.Lupdate_just_ret:
	ret

/*
 * void clmul_ghash_setkey(be128 *shash, const u8 *key);
 *
 * Calculate hash_key << 1 mod poly
 *
 * In:	  %rdi = shash, 16 bytes written (no alignment required)
 *	  %rsi = key, 16 bytes read (no alignment required)
 * Clobb: %xmm0, %xmm1, %xmm2, BSWAP (%xmm5)
 */
ENTRY(clmul_ghash_setkey)
	movaps .Lbswap_mask(%rip), BSWAP
	movups (%rsi), %xmm0
	PSHUFB_XMM BSWAP %xmm0

	/* 128-bit left shift by one, built from two 64-bit lane shifts */
	movaps %xmm0, %xmm1
	psllq $1, %xmm0
	psrlq $63, %xmm1		/* bit shifted out of each qword */
	movaps %xmm1, %xmm2
	pslldq $8, %xmm1		/* carry of low qword -> high qword */
	psrldq $8, %xmm2		/* carry of high qword -> low qword */
	por %xmm1, %xmm0

	/*
	 * reduction
	 * %xmm2 holds the bit shifted out of the top in dword 0. Spread it
	 * to dwords 0 and 3 and compare with {1, 0, 0, 1}: the result is
	 * all-ones when that bit was set. Otherwise only dwords 1 and 2
	 * match, and those dwords of .Lpoly are zero, so the pand yields
	 * .Lpoly or zero.
	 */
	pshufd $0b00100100, %xmm2, %xmm1
	pcmpeqd .Ltwo_one(%rip), %xmm1
	pand .Lpoly(%rip), %xmm1
	pxor %xmm1, %xmm0
	movups %xmm0, (%rdi)
	ret