/*
 * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7(BE)/0xEDB88320(LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 *  Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 *  Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:	Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *		Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.align		6
	.arch		armv8-a
	.arch_extension	crc
	.fpu		crypto-neon-fp-armv8

.Lcrc32_constants:
	/*
	 * [(x4*128+32 mod P(x) << 32)]' << 1   = 0x154442bd4
	 * #define CONSTANT_R1  0x154442bd4LL
	 *
	 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
	 * #define CONSTANT_R2  0x1c6e41596LL
	 */
	.quad		0x0000000154442bd4
	.quad		0x00000001c6e41596

	/*
	 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
	 * #define CONSTANT_R3  0x1751997d0LL
	 *
	 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
	 * #define CONSTANT_R4  0x0ccaa009eLL
	 */
	.quad		0x00000001751997d0
	.quad		0x00000000ccaa009e

	/*
	 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
	 * #define CONSTANT_R5  0x163cd6124LL
	 */
	.quad		0x0000000163cd6124
	.quad		0x00000000FFFFFFFF

	/*
	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
	 *
	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
	 *                                        = 0x1F7011641LL
	 * #define CONSTANT_RU  0x1F7011641LL
	 */
	.quad		0x00000001DB710641
	.quad		0x00000001F7011641

.Lcrc32c_constants:
	/*
	 * The analogous fold and Barrett reduction constants for CRC32C
	 * (Castagnoli polynomial 0x1EDC6F41, bit-reflected 0x82F63B78),
	 * laid out in the same order and with the same roles as the CRC32
	 * set above, so the code below can index both tables identically.
	 */
	.quad		0x00000000740eef02
	.quad		0x000000009e4addf8
	.quad		0x00000000f20c0dfe
	.quad		0x000000014cd00bd6
	.quad		0x00000000dd45aab8
	.quad		0x00000000FFFFFFFF
	.quad		0x0000000105ec76f0
	.quad		0x00000000dea713f1

	dCONSTANTl	.req	d0
	dCONSTANTh	.req	d1
	qCONSTANT	.req	q0

	BUF		.req	r0
	LEN		.req	r1
	CRC		.req	r2

	qzr		.req	q9
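	/*
	 * For reference only (not assembled): a minimal bitwise C model of
	 * the little-endian CRCs computed below, using the bit-reflected
	 * polynomials named in the header (0xEDB88320 for CRC32, 0x82F63B78
	 * for CRC32C). The helper name is illustrative, and any pre-/post-
	 * inversion of the CRC value is assumed to be handled by the caller,
	 * just as it is for the assembly routines in this file.
	 *
	 *	static u32 crc32_le_bitwise(u32 crc, const u8 *p, size_t len,
	 *				    u32 poly)
	 *	{
	 *		while (len--) {
	 *			int i;
	 *
	 *			crc ^= *p++;
	 *			for (i = 0; i < 8; i++)
	 *				crc = (crc >> 1) ^ ((crc & 1) ? poly : 0);
	 *		}
	 *		return crc;
	 *	}
	 */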
	/**
	 * Calculate crc32
	 * BUF - buffer
	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
	 * CRC - initial crc32
	 * return r0 crc32
	 * uint crc32_pmull_le(unsigned char const *buffer,
	 *                     size_t len, uint crc32)
	 */
ENTRY(crc32_pmull_le)
	adr		r3, .Lcrc32_constants
	b		0f

ENTRY(crc32c_pmull_le)
	adr		r3, .Lcrc32c_constants

0:	bic		LEN, LEN, #15
	vld1.8		{q1-q2}, [BUF, :128]!
	vld1.8		{q3-q4}, [BUF, :128]!
	vmov.i8		qzr, #0
	vmov.i8		qCONSTANT, #0
	vmov.32		dCONSTANTl[0], CRC
	veor.8		d2, d2, dCONSTANTl
	sub		LEN, LEN, #0x40
	cmp		LEN, #0x40
	blt		less_64

	vld1.64		{qCONSTANT}, [r3]

loop_64:		/* 64 bytes full cache line folding */
	sub		LEN, LEN, #0x40

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q6, d5, dCONSTANTh
	vmull.p64	q7, d7, dCONSTANTh
	vmull.p64	q8, d9, dCONSTANTh

	vmull.p64	q1, d2, dCONSTANTl
	vmull.p64	q2, d4, dCONSTANTl
	vmull.p64	q3, d6, dCONSTANTl
	vmull.p64	q4, d8, dCONSTANTl

	veor.8		q1, q1, q5
	vld1.8		{q5}, [BUF, :128]!
	veor.8		q2, q2, q6
	vld1.8		{q6}, [BUF, :128]!
	veor.8		q3, q3, q7
	vld1.8		{q7}, [BUF, :128]!
	veor.8		q4, q4, q8
	vld1.8		{q8}, [BUF, :128]!

	veor.8		q1, q1, q5
	veor.8		q2, q2, q6
	veor.8		q3, q3, q7
	veor.8		q4, q4, q8

	cmp		LEN, #0x40
	bge		loop_64

less_64:		/* Folding cache line into 128 bit */
	vldr		dCONSTANTl, [r3, #16]
	vldr		dCONSTANTh, [r3, #24]

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q2

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q3

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q4

	teq		LEN, #0
	beq		fold_64

loop_16:		/* Folding rest of buffer into 128 bit */
	subs		LEN, LEN, #0x10

	vld1.8		{q2}, [BUF, :128]!
	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q2

	bne		loop_16
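	/*
	 * Summary of the folding above (added for clarity; "*" denotes the
	 * 64x64 -> 128 bit carry-less multiply performed by vmull.p64, and
	 * constants are named as in the comments at .Lcrc32_constants):
	 *
	 *   loop_64: qN <- (dN_lo * CONSTANT_R1) ^ (dN_hi * CONSTANT_R2) ^ <next 16 bytes>
	 *            for each of the four lanes q1..q4, i.e. each lane is
	 *            folded forward by 64 bytes per pass
	 *   less_64: q1 <- (d2 * CONSTANT_R3) ^ (d3 * CONSTANT_R4) ^ q2 (then q3, then q4),
	 *            collapsing the four lanes into a single 128-bit value
	 *   loop_16: q1 <- (d2 * CONSTANT_R3) ^ (d3 * CONSTANT_R4) ^ <next 16 bytes>
	 *
	 * fold_64 below reduces the remaining 128 bits to the final 32-bit
	 * CRC via a last 64-bit fold, a 32-bit fold and a Barrett reduction.
	 */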
fold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	vmull.p64	q2, d2, dCONSTANTh
	vext.8		q1, q1, qzr, #8
	veor.8		q1, q1, q2

	/* final 32-bit fold */
	vldr		dCONSTANTl, [r3, #32]
	vldr		d6, [r3, #40]
	vmov.i8		d7, #0

	vext.8		q2, q1, qzr, #4
	vand.8		d2, d2, d6
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q2

	/* Finish up with the bit-reversed Barrett reduction 64 ==> 32 bits */
	vldr		dCONSTANTl, [r3, #48]
	vldr		dCONSTANTh, [r3, #56]

	vand.8		q2, q1, q3
	vext.8		q2, qzr, q2, #8
	vmull.p64	q2, d5, dCONSTANTh
	vand.8		q2, q2, q3
	vmull.p64	q2, d4, dCONSTANTl
	veor.8		q1, q1, q2
	vmov		r0, s5

	bx		lr
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)

	.macro		__crc32, c
	subs		ip, r2, #8
	bmi		.Ltail\c

	tst		r1, #3
	bne		.Lunaligned\c

	teq		ip, #0
.Laligned8\c:
	ldrd		r2, r3, [r1], #8
ARM_BE8(rev		r2, r2		)
ARM_BE8(rev		r3, r3		)
	crc32\c\()w	r0, r0, r2
	crc32\c\()w	r0, r0, r3
	bxeq		lr
	subs		ip, ip, #8
	bpl		.Laligned8\c

.Ltail\c:
	tst		ip, #4
	beq		2f
	ldr		r3, [r1], #4
ARM_BE8(rev		r3, r3		)
	crc32\c\()w	r0, r0, r3

2:	tst		ip, #2
	beq		1f
	ldrh		r3, [r1], #2
ARM_BE8(rev16		r3, r3		)
	crc32\c\()h	r0, r0, r3

1:	tst		ip, #1
	bxeq		lr
	ldrb		r3, [r1]
	crc32\c\()b	r0, r0, r3
	bx		lr

.Lunaligned\c:
	tst		r1, #1
	beq		2f
	ldrb		r3, [r1], #1
	subs		r2, r2, #1
	crc32\c\()b	r0, r0, r3

	tst		r1, #2
	beq		0f
2:	ldrh		r3, [r1], #2
	subs		r2, r2, #2
ARM_BE8(rev16		r3, r3		)
	crc32\c\()h	r0, r0, r3

0:	subs		ip, r2, #8
	bpl		.Laligned8\c
	b		.Ltail\c
	.endm

	.align		5
ENTRY(crc32_armv8_le)
	__crc32
ENDPROC(crc32_armv8_le)

	.align		5
ENTRY(crc32c_armv8_le)
	__crc32		c
ENDPROC(crc32c_armv8_le)
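	/*
	 * Calling-convention sketch (illustrative only, not part of this
	 * file's interface). The C prototypes are inferred from the doc
	 * comment and the register usage above (r0..r2 per AAPCS); the
	 * wrapper name and dispatch policy are hypothetical, and in kernel
	 * context the PMULL path would additionally need to be bracketed by
	 * kernel_neon_begin()/kernel_neon_end(), omitted here for brevity.
	 *
	 *	u32 crc32_pmull_le(const u8 *buf, size_t len, u32 init_crc);
	 *	u32 crc32c_pmull_le(const u8 *buf, size_t len, u32 init_crc);
	 *	u32 crc32_armv8_le(u32 init_crc, const u8 *buf, size_t len);
	 *	u32 crc32c_armv8_le(u32 init_crc, const u8 *buf, size_t len);
	 *
	 *	// Hypothetical wrapper: use the PMULL path only for large,
	 *	// 16-byte-multiple chunks (LEN > 63) and let the CRC
	 *	// instruction path mop up the remainder.
	 *	static u32 crc32_update(u32 crc, const u8 *data, size_t len)
	 *	{
	 *		if (len >= 64) {
	 *			size_t chunk = len & ~(size_t)15;
	 *
	 *			crc = crc32_pmull_le(data, chunk, crc);
	 *			data += chunk;
	 *			len -= chunk;
	 *		}
	 *		if (len)
	 *			crc = crc32_armv8_le(crc, data, len);
	 *		return crc;
	 *	}
	 */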