/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Use the hardware-provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2; the reference can be
 * found at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *          Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <asm/inst.h>


.section .rodata
.align 16
/*
 * [(x^(4*128+32) mod P(x) << 32)]' << 1 = 0x154442bd4
 * #define CONSTANT_R1 0x154442bd4LL
 *
 * [(x^(4*128-32) mod P(x) << 32)]' << 1 = 0x1c6e41596
 * #define CONSTANT_R2 0x1c6e41596LL
 */
.Lconstant_R2R1:
	.octa 0x00000001c6e415960000000154442bd4
/*
 * [(x^(128+32) mod P(x) << 32)]' << 1 = 0x1751997d0
 * #define CONSTANT_R3 0x1751997d0LL
 *
 * [(x^(128-32) mod P(x) << 32)]' << 1 = 0x0ccaa009e
 * #define CONSTANT_R4 0x0ccaa009eLL
 */
.Lconstant_R4R3:
	.octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x^64 mod P(x) << 32)]' << 1 = 0x163cd6124
 * #define CONSTANT_R5 0x163cd6124LL
 */
.Lconstant_R5:
	.octa 0x00000000000000000000000163cd6124
.Lconstant_mask32:
	.octa 0x000000000000000000000000FFFFFFFF
/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett reduction constant (u64`) = u` = (x^64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU 0x1F7011641LL
 */
.Lconstant_RUpoly:
	.octa 0x00000001F701164100000001DB710641

#define CONSTANT %xmm0

#ifdef __x86_64__
#define BUF	%rdi
#define LEN	%rsi
#define CRC	%edx
#else
#define BUF	%eax
#define LEN	%edx
#define CRC	%ecx
#endif



.text
/**
 *	Calculate crc32
 *	BUF - buffer (16-byte aligned)
 *	LEN - buffer size in bytes (16-byte aligned), must be greater than 63
 *	CRC - initial crc32
 *	return: crc32 in %eax
 *	uint crc32_pclmul_le_16(unsigned char const *buffer,
 *	                        size_t len, uint crc32)
 */

SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16-byte aligned */
	movdqa	(BUF), %xmm1
	movdqa	0x10(BUF), %xmm2
	movdqa	0x20(BUF), %xmm3
	movdqa	0x30(BUF), %xmm4
	movd	CRC, CONSTANT
	pxor	CONSTANT, %xmm1		/* mix the initial CRC into the first 16 bytes */
	sub	$0x40, LEN
	add	$0x40, BUF
	cmp	$0x40, LEN
	jb	less_64

#ifdef __x86_64__
	movdqa .Lconstant_R2R1(%rip), CONSTANT
#else
	movdqa .Lconstant_R2R1, CONSTANT
#endif
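/*
 * A sketch of the folding step in loop_64 below (informal notation):
 * with CONSTANT = R2:R1, each iteration computes, for every 128-bit
 * lane x,
 *
 *	x' = (x.low64 clmul R1) xor (x.high64 clmul R2) xor (next 16 input bytes)
 *
 * where clmul is the carry-less multiplication performed by
 * PCLMULQDQ. Doing this for all four lanes consumes 64 bytes per
 * iteration while keeping each lane congruent to its share of the
 * CRC remainder modulo P(x).
 */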
loop_64:	/* 64-byte (full cache line) folding */
	prefetchnta 0x40(BUF)
	movdqa	%xmm1, %xmm5
	movdqa	%xmm2, %xmm6
	movdqa	%xmm3, %xmm7
#ifdef __x86_64__
	movdqa	%xmm4, %xmm8
#endif
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x00, CONSTANT, %xmm2
	PCLMULQDQ 0x00, CONSTANT, %xmm3
#ifdef __x86_64__
	PCLMULQDQ 0x00, CONSTANT, %xmm4
#endif
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	PCLMULQDQ 0x11, CONSTANT, %xmm6
	PCLMULQDQ 0x11, CONSTANT, %xmm7
#ifdef __x86_64__
	PCLMULQDQ 0x11, CONSTANT, %xmm8
#endif
	pxor	%xmm5, %xmm1
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
#ifdef __x86_64__
	pxor	%xmm8, %xmm4
#else
	/* %xmm8 is not available on 32-bit x86; fold the fourth lane
	 * through the already-freed %xmm5 instead */
	movdqa	%xmm4, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm4
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm4
#endif

	pxor	(BUF), %xmm1
	pxor	0x10(BUF), %xmm2
	pxor	0x20(BUF), %xmm3
	pxor	0x30(BUF), %xmm4

	sub	$0x40, LEN
	add	$0x40, BUF
	cmp	$0x40, LEN
	jge	loop_64
less_64:	/* fold the four 128-bit lanes into one */
#ifdef __x86_64__
	movdqa .Lconstant_R4R3(%rip), CONSTANT
#else
	movdqa .Lconstant_R4R3, CONSTANT
#endif
	prefetchnta	(BUF)

	movdqa	%xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm2, %xmm1

	movdqa	%xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm3, %xmm1

	movdqa	%xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm4, %xmm1

	cmp	$0x10, LEN
	jb	fold_64
loop_16:	/* fold the remaining 16-byte blocks into the 128-bit remainder */
	movdqa	%xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	(BUF), %xmm1
	sub	$0x10, LEN
	add	$0x10, BUF
	cmp	$0x10, LEN
	jge	loop_16

fold_64:
	/* Perform the last 64-bit fold; this also appends 32 zero bits
	 * to the input stream */
	PCLMULQDQ 0x01, %xmm1, CONSTANT	/* R4 * xmm1.low */
	psrldq	$0x08, %xmm1
	pxor	CONSTANT, %xmm1

	/* final 32-bit fold */
	movdqa	%xmm1, %xmm2
#ifdef __x86_64__
	movdqa	.Lconstant_R5(%rip), CONSTANT
	movdqa	.Lconstant_mask32(%rip), %xmm3
#else
	movdqa	.Lconstant_R5, CONSTANT
	movdqa	.Lconstant_mask32, %xmm3
#endif
	psrldq	$0x04, %xmm2
	pand	%xmm3, %xmm1
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	pxor	%xmm2, %xmm1

	/* Finish up with the bit-reversed Barrett reduction, 64 ==> 32 bits */
#ifdef __x86_64__
	movdqa	.Lconstant_RUpoly(%rip), CONSTANT
#else
	movdqa	.Lconstant_RUpoly, CONSTANT
#endif
	movdqa	%xmm1, %xmm2
	pand	%xmm3, %xmm1
	PCLMULQDQ 0x10, CONSTANT, %xmm1
	pand	%xmm3, %xmm1
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	pxor	%xmm2, %xmm1
	PEXTRD	0x01, %xmm1, %eax

	ret
SYM_FUNC_END(crc32_pclmul_le_16)
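/*
 * A usage sketch in C (illustrative only; the wrapper name below is
 * hypothetical, while kernel_fpu_begin()/kernel_fpu_end() and the
 * generic crc32_le() are real kernel APIs). Per the header comment
 * above, a caller must guarantee 16-byte alignment of both buffer
 * and length, len > 63, and a SIMD-safe context:
 *
 *	static u32 crc32_pclmul_wrap(u32 crc, const u8 *buf, size_t len)
 *	{
 *		kernel_fpu_begin();
 *		crc = crc32_pclmul_le_16(buf, len, crc);
 *		kernel_fpu_end();
 *		return crc;
 *	}
 *
 * Any unaligned head or sub-64-byte tail has to be folded in
 * separately, e.g. with the generic crc32_le(); the in-tree glue
 * code around this routine handles that split.
 */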