/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions. This file contains accelerated part of ghash
 * implementation. More information about PCLMULQDQ can be found at:
 *
 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
 *
 * Copyright (c) 2009 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *	     Vinodh Gopal
 *	     Erdinc Ozturk
 *	     Deniz Karakoyunlu
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

/*
 * Syntax: GNU as, AT&T operand order (source, destination), run through
 * the C preprocessor.  The exported functions take their arguments in
 * %rdi, %rsi, %rdx, %rcx as shown by the C prototypes quoted above each
 * entry point.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>

/* 16-byte shuffle control used with PSHUFB to reverse the byte order. */
.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.octa 0x000102030405060708090a0b0c0d0e0f

/* Register roles shared by every routine in this file. */
#define DATA	%xmm0		/* running hash value / multiplication result */
#define SHASH	%xmm1		/* hash key operand */
#define T1	%xmm2		/* scratch */
#define T2	%xmm3		/* scratch */
#define T3	%xmm4		/* scratch */
#define BSWAP	%xmm5		/* copy of .Lbswap_mask */
#define IN1	%xmm6		/* current 16-byte input block */

.text

/*
 * __clmul_gf128mul_ble:	internal ABI
 * input:
 *	DATA:			operand1
 *	SHASH:			operand2, hash_key << 1 mod poly
 * output:
 *	DATA:			operand1 * operand2 mod poly
 * changed:
 *	T1
 *	T2
 *	T3
 *
 * No general purpose register and no memory is touched; SHASH, BSWAP
 * and IN1 are left unmodified.
 */
__clmul_gf128mul_ble:
	movaps DATA, T1
	pshufd $0b01001110, DATA, T2	# T2 = DATA with 64-bit halves swapped
	pshufd $0b01001110, SHASH, T3	# T3 = SHASH with 64-bit halves swapped
	pxor DATA, T2			# both halves of T2 = a1 + a0
	pxor SHASH, T3			# both halves of T3 = b1 + b0

	PCLMULQDQ 0x00 SHASH DATA	# DATA = a0 * b0
	PCLMULQDQ 0x11 SHASH T1		# T1 = a1 * b1
	PCLMULQDQ 0x00 T3 T2		# T2 = (a1 + a0) * (b1 + b0)
	pxor DATA, T2
	pxor T1, T2			# T2 = a0 * b1 + a1 * b0

	/* Fold the middle term into the low and high 128-bit halves. */
	movaps T2, T3
	pslldq $8, T3
	psrldq $8, T2
	pxor T3, DATA
	pxor T2, T1			# <T1:DATA> is result of
					# carry-less multiplication

	/*
	 * first phase of the reduction
	 *
	 * Per 64-bit lane the shift/xor chain below yields
	 *	T3 = (DATA << 63) ^ (DATA << 62) ^ (DATA << 57)
	 */
	movaps DATA, T3
	psllq $1, T3
	pxor DATA, T3
	psllq $5, T3
	pxor DATA, T3
	psllq $57, T3
	movaps T3, T2
	pslldq $8, T2
	psrldq $8, T3
	pxor T2, DATA
	pxor T3, T1

	/*
	 * second phase of the reduction
	 *
	 * Per 64-bit lane the shift/xor chain below yields
	 *	T2 = (DATA >> 7) ^ (DATA >> 2) ^ (DATA >> 1)
	 */
	movaps DATA, T2
	psrlq $5, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor T2, T1
	pxor T1, DATA
	ret
ENDPROC(__clmul_gf128mul_ble)

/*
 * void clmul_ghash_mul(char *dst, const u128 *shash)
 *
 * In:	%rdi = dst, 16 bytes, read and then overwritten with the product
 *	%rsi = shash, 16 bytes, read only
 * Clobbers: DATA, SHASH, T1, T2, T3, BSWAP
 *
 * dst is byte-reversed with .Lbswap_mask, multiplied by shash through
 * __clmul_gf128mul_ble, byte-reversed again and stored back.  Both
 * pointers are accessed with unaligned moves.
 */
ENTRY(clmul_ghash_mul)
	FRAME_BEGIN
	movups (%rdi), DATA
	movups (%rsi), SHASH
	movaps .Lbswap_mask(%rip), BSWAP	# RIP-relative: no absolute relocation
	PSHUFB_XMM BSWAP DATA
	call __clmul_gf128mul_ble
	PSHUFB_XMM BSWAP DATA
	movups DATA, (%rdi)
	FRAME_END
	ret
ENDPROC(clmul_ghash_mul)

/*
 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
 *			   const u128 *shash);
 *
 * In:	%rdi = dst, 16-byte running hash, updated in place
 *	%rsi = src, input data, consumed in 16-byte blocks
 *	%edx = srclen, byte count of src
 *	%rcx = shash, 16 bytes, read only
 * Clobbers: %rsi, %rdx, flags, DATA, SHASH, T1, T2, T3, BSWAP, IN1
 *
 * For every complete 16-byte block of src the block is byte-reversed,
 * xored into the hash and the hash is multiplied by shash.  A trailing
 * partial block (srclen % 16 bytes) is not read.  When srclen < 16 the
 * function returns without touching dst.
 *
 * srclen is a 32-bit unsigned int, so only %edx is examined and all
 * length tests use unsigned conditions (jb / jae).
 */
ENTRY(clmul_ghash_update)
	FRAME_BEGIN
	cmp $16, %edx
	jb .Lupdate_just_ret	# check length
	movaps .Lbswap_mask(%rip), BSWAP	# RIP-relative: no absolute relocation
	movups (%rdi), DATA
	movups (%rcx), SHASH
	PSHUFB_XMM BSWAP DATA
.align 4
.Lupdate_loop:
	/* Invariant: at least 16 unread bytes remain at (%rsi). */
	movups (%rsi), IN1
	PSHUFB_XMM BSWAP IN1
	pxor IN1, DATA
	call __clmul_gf128mul_ble
	sub $16, %edx		# bytes remaining
	add $16, %rsi		# advance to the next block
	cmp $16, %edx
	jae .Lupdate_loop	# unsigned: another full block is available
	PSHUFB_XMM BSWAP DATA
	movups DATA, (%rdi)
.Lupdate_just_ret:
	FRAME_END
	ret
ENDPROC(clmul_ghash_update)