/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions. This file contains accelerated part of ghash
 * implementation. More information about PCLMULQDQ can be found at:
 *
 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
 *
 * Copyright (c) 2009 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *	     Vinodh Gopal
 *	     Erdinc Ozturk
 *	     Deniz Karakoyunlu
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

/*
 * Syntax: GNU as, AT&T operand order (source first, destination last),
 * preprocessed by cpp.  Target: x86-64; the exported entry points take
 * their arguments in %rdi, %rsi, %rdx/%edx, %rcx.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>

/*
 * The shuffle mask is never written, so it lives in a read-only,
 * mergeable 16-byte-constant section instead of writable .data.
 */
.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
/*
 * PSHUFB control vector.  Stored little-endian, byte i of the mask is
 * (15 - i), so shuffling with it reverses the 16 bytes of a register.
 */
.Lbswap_mask:
	.octa 0x000102030405060708090a0b0c0d0e0f

/* Register roles shared by every routine in this file. */
#define DATA	%xmm0		/* running value / operand1 / result */
#define SHASH	%xmm1		/* operand2 (hash key)               */
#define T1	%xmm2		/* scratch                           */
#define T2	%xmm3		/* scratch                           */
#define T3	%xmm4		/* scratch                           */
#define BSWAP	%xmm5		/* copy of .Lbswap_mask              */
#define IN1	%xmm6		/* current input block               */

.text

/*
 * __clmul_gf128mul_ble:	internal ABI
 * input:
 *	DATA:			operand1
 *	SHASH:			operand2, hash_key << 1 mod poly
 * output:
 *	DATA:			operand1 * operand2 mod poly
 * changed:
 *	T1
 *	T2
 *	T3
 *
 * SHASH, BSWAP, IN1 and all general purpose registers are left
 * untouched, which the callers below rely on across the call.
 */
__clmul_gf128mul_ble:
	/*
	 * Three-multiplication (Karatsuba style) 128x128 carry-less
	 * product.  a1:a0 are the high:low quadwords of DATA, b1:b0
	 * those of SHASH.  pshufd with 0b01001110 swaps the two
	 * quadwords, so after the pxor each half holds a1 + a0
	 * (respectively b1 + b0).
	 */
	movaps DATA, T1
	pshufd $0b01001110, DATA, T2
	pshufd $0b01001110, SHASH, T3
	pxor DATA, T2			/* T2 = a1 + a0 (both halves) */
	pxor SHASH, T3			/* T3 = b1 + b0 (both halves) */

	PCLMULQDQ 0x00 SHASH DATA	/* DATA = a0 * b0 */
	PCLMULQDQ 0x11 SHASH T1		/* T1 = a1 * b1 */
	PCLMULQDQ 0x00 T3 T2		/* T2 = (a1 + a0) * (b1 + b0) */
	pxor DATA, T2
	pxor T1, T2			/* T2 = a0 * b1 + a1 * b0 */

	/* Fold the middle term into the 256-bit product. */
	movaps T2, T3
	pslldq $8, T3			/* low half of T2 -> high half of DATA */
	psrldq $8, T2			/* high half of T2 -> low half of T1 */
	pxor T3, DATA
	pxor T2, T1			/* <T1:DATA> is result of */
					/* carry-less multiplication */

	/*
	 * first phase of the reduction
	 *
	 * Per 64-bit lane the shift/xor chain below yields
	 *	T3 = DATA << 63 ^ DATA << 62 ^ DATA << 57
	 */
	movaps DATA, T3
	psllq $1, T3
	pxor DATA, T3
	psllq $5, T3
	pxor DATA, T3
	psllq $57, T3
	movaps T3, T2
	pslldq $8, T2			/* low quadword of T3 -> high of DATA */
	psrldq $8, T3			/* high quadword of T3 -> low of T1 */
	pxor T2, DATA
	pxor T3, T1

	/*
	 * second phase of the reduction
	 *
	 * Per 64-bit lane the shift/xor chain below yields
	 *	T2 = DATA >> 1 ^ DATA >> 2 ^ DATA >> 7
	 * and the final result is DATA ^ T1 ^ T2.
	 */
	movaps DATA, T2
	psrlq $5, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor T2, T1
	pxor T1, DATA
	ret
ENDPROC(__clmul_gf128mul_ble)

/*
 * void clmul_ghash_mul(char *dst, const u128 *shash)
 *
 * In:	%rdi = dst, 16 bytes, read and then overwritten with the product
 *	%rsi = shash, 16-byte key operand (loaded with movups, so no
 *	       alignment is required of either pointer)
 * Clobbers: DATA, SHASH, T1, T2, T3, BSWAP (%xmm0-%xmm5)
 *
 * The block at dst is byte-reversed before the multiplication and
 * byte-reversed again before it is stored back.
 */
ENTRY(clmul_ghash_mul)
	FRAME_BEGIN
	movups (%rdi), DATA
	movups (%rsi), SHASH
	movaps .Lbswap_mask(%rip), BSWAP	/* section is 16-byte aligned */
	PSHUFB_XMM BSWAP DATA
	call __clmul_gf128mul_ble
	PSHUFB_XMM BSWAP DATA
	movups DATA, (%rdi)
	FRAME_END
	ret
ENDPROC(clmul_ghash_mul)

/*
 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
 *			   const u128 *shash);
 *
 * In:	%rdi = dst, 16-byte running digest, updated in place
 *	%rsi = src, input bytes
 *	%edx = srclen, number of input bytes
 *	%rcx = shash, 16-byte key operand
 * Clobbers: %rsi, %rdx, flags, DATA, SHASH, T1, T2, T3, BSWAP, IN1
 *	     (%xmm0-%xmm6)
 *
 * Consumes src in whole 16-byte blocks: each block is byte-reversed,
 * xored into the digest and the digest is multiplied by the key.  If
 * srclen < 16 nothing is read or written; trailing bytes beyond the
 * last whole block are not processed.
 *
 * srclen is a 32-bit argument, so only %edx is examined: the calling
 * convention does not define bits 63:32 of %rdx for it.  Lengths are
 * unsigned, hence the unsigned branches (jb/jae).
 */
ENTRY(clmul_ghash_update)
	FRAME_BEGIN
	cmp $16, %edx
	jb .Lupdate_just_ret		/* less than one block: do nothing */
	movaps .Lbswap_mask(%rip), BSWAP
	movups (%rdi), DATA
	movups (%rcx), SHASH
	PSHUFB_XMM BSWAP DATA
.align 4
.Lupdate_loop:
	/* Invariant: %edx >= 16 bytes remain at (%rsi). */
	movups (%rsi), IN1
	PSHUFB_XMM BSWAP IN1
	pxor IN1, DATA
	call __clmul_gf128mul_ble	/* preserves %rsi, %rdx, SHASH, BSWAP */
	sub $16, %edx
	add $16, %rsi
	cmp $16, %edx
	jae .Lupdate_loop		/* another whole block left */
	PSHUFB_XMM BSWAP DATA
	movups DATA, (%rdi)
.Lupdate_just_ret:
	FRAME_END
	ret
ENDPROC(clmul_ghash_update)