//
// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Function API:
//     UINT16 crc_t10dif_pcl(
//             UINT16 init_crc,          // initial CRC value, 16 bits
//             const unsigned char *buf, // buffer pointer to calculate CRC on
//             UINT64 len                // buffer length in bytes (64-bit data)
//     );
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//

#include <linux/linkage.h>
#include <asm/assembler.h>

#ifdef CONFIG_CPU_ENDIAN_BE8
#define CPU_LE(code...)
#else
#define CPU_LE(code...)		code
#endif
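
//
// The routine below is entered from C glue code. Under the AAPCS the three
// arguments arrive in r0 (the 16-bit seed), r1 (the buffer pointer) and
// r2 (the length in bytes), matching the arg1_low32/arg2/arg3 aliases
// defined after .text, and the 16-bit result is returned in r0. A rough
// sketch of the matching C-side declaration (the authoritative prototype
// lives in the accompanying glue code; the exact spelling there may differ):
//
//	asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, u32 len);
//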

	.text
	.fpu		crypto-neon-fp-armv8

	arg1_low32	.req	r0
	arg2		.req	r1
	arg3		.req	r2

	qzr		.req	q13

	q0l		.req	d0
	q0h		.req	d1
	q1l		.req	d2
	q1h		.req	d3
	q2l		.req	d4
	q2h		.req	d5
	q3l		.req	d6
	q3h		.req	d7
	q4l		.req	d8
	q4h		.req	d9
	q5l		.req	d10
	q5h		.req	d11
	q6l		.req	d12
	q6h		.req	d13
	q7l		.req	d14
	q7h		.req	d15

ENTRY(crc_t10dif_pmull)
	vmov.i8		qzr, #0			// init zero register

	// adjust the 16-bit initial_crc value, scale it to 32 bits
	lsl		arg1_low32, arg1_low32, #16

	// check if the buffer is smaller than 256 bytes
	cmp		arg3, #256

	// for sizes less than 256 we cannot use the bulk folding path below
	blt		_less_than_128

	// load the initial crc value
	// the crc value does not need to be byte-reflected, but it does need
	// to be moved to the high part of the register, because the data will
	// be byte-reflected and will then align with the initial crc in the
	// correct place.
	vmov		s0, arg1_low32		// initial crc
	vext.8		q10, qzr, q0, #4

	// receive the initial 128 bytes of data, xor the initial crc value
	vld1.64		{q0-q1}, [arg2, :128]!
	vld1.64		{q2-q3}, [arg2, :128]!
	vld1.64		{q4-q5}, [arg2, :128]!
	vld1.64		{q6-q7}, [arg2, :128]!
CPU_LE(	vrev64.8	q0, q0	)
CPU_LE(	vrev64.8	q1, q1	)
CPU_LE(	vrev64.8	q2, q2	)
CPU_LE(	vrev64.8	q3, q3	)
CPU_LE(	vrev64.8	q4, q4	)
CPU_LE(	vrev64.8	q5, q5	)
CPU_LE(	vrev64.8	q6, q6	)
CPU_LE(	vrev64.8	q7, q7	)

	vswp		d0, d1
	vswp		d2, d3
	vswp		d4, d5
	vswp		d6, d7
	vswp		d8, d9
	vswp		d10, d11
	vswp		d12, d13
	vswp		d14, d15

	// XOR the initial_crc value
	veor.8		q0, q0, q10

	adr		ip, rk3
	vld1.64		{q10}, [ip, :128]	// q10 has rk3 and rk4

	//
	// we subtract 256 instead of 128 to save one instruction from the loop
	//
	sub		arg3, arg3, #256

	// at this point there are 128*x+y (0 <= y < 128) bytes of buffer
	// left. The _fold_64_B_loop below folds 128 bytes per iteration
	// until only 128+y bytes remain.


	// fold 128 bytes at a time. This section of the code folds the
	// eight vector registers q0-q7 in parallel
_fold_64_B_loop:

	.macro		fold64, reg1, reg2
	vld1.64		{q11-q12}, [arg2, :128]!

	vmull.p64	q8, \reg1\()h, d21
	vmull.p64	\reg1, \reg1\()l, d20
	vmull.p64	q9, \reg2\()h, d21
	vmull.p64	\reg2, \reg2\()l, d20

CPU_LE(	vrev64.8	q11, q11	)
CPU_LE(	vrev64.8	q12, q12	)
	vswp		d22, d23
	vswp		d24, d25

	veor.8		\reg1, \reg1, q8
	veor.8		\reg2, \reg2, q9
	veor.8		\reg1, \reg1, q11
	veor.8		\reg2, \reg2, q12
	.endm

	fold64		q0, q1
	fold64		q2, q3
	fold64		q4, q5
	fold64		q6, q7

	subs		arg3, arg3, #128

	// check if there are another 128 bytes in the buffer to be folded
	bge		_fold_64_B_loop
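
	// Folding relies on the carry-less multiplication identity
	//
	//	(A(x) * x^N + B(x)) mod Q(x) == (A(x) * (x^N mod Q(x)) + B(x)) mod Q(x)
	//
	// i.e. multiplying an accumulator by a precomputed power of x reduced
	// mod Q and XORing the product into data that lies N bits further
	// along the stream keeps the running value congruent to the final
	// remainder without doing a full reduction for every block. The two
	// vmull.p64 instructions in fold64 apply that multiplication to the
	// two 64-bit halves of each accumulator, using the constants held
	// in q10.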

	// at this point, the buffer pointer is pointing at the last y bytes
	// of the buffer, and the 128 bytes of folded data are held in the
	// eight vector registers q0-q7

	// fold the 8 vector registers to 1 vector register with different
	// constants

	adr		ip, rk9
	vld1.64		{q10}, [ip, :128]!

	.macro		fold16, reg, rk
	vmull.p64	q8, \reg\()l, d20
	vmull.p64	\reg, \reg\()h, d21
	.ifnb		\rk
	vld1.64		{q10}, [ip, :128]!
	.endif
	veor.8		q7, q7, q8
	veor.8		q7, q7, \reg
	.endm

	fold16		q0, rk11
	fold16		q1, rk13
	fold16		q2, rk15
	fold16		q3, rk17
	fold16		q4, rk19
	fold16		q5, rk1
	fold16		q6

	// instead of adding 128 to the loop counter, we add 128-16 to save
	// one instruction from the loop below: instead of a cmp instruction,
	// we use the negative flag with the blt instruction
	adds		arg3, arg3, #(128-16)
	blt		_final_reduction_for_128

	// now we have 16+y bytes left to reduce. 16 bytes are in register q7
	// and the rest is in memory. We can fold 16 bytes at a time if y >= 16:
	// continue folding 16 bytes at a time

_16B_reduction_loop:
	vmull.p64	q8, d14, d20
	vmull.p64	q7, d15, d21
	veor.8		q7, q7, q8

	vld1.64		{q0}, [arg2, :128]!
CPU_LE(	vrev64.8	q0, q0	)
	vswp		d0, d1
	veor.8		q7, q7, q0
	subs		arg3, arg3, #16

	// instead of a cmp instruction, we utilize the flags with the
	// bge instruction; equivalent of: cmp arg3, 16-16
	// check if there are any more 16-byte blocks in the buffer to fold
	bge		_16B_reduction_loop

	// now we have 16+z bytes left to reduce, where 0 <= z < 16.
	// first, we reduce the data in the q7 register

_final_reduction_for_128:
	// check if any more data to fold. If not, compute the CRC of
	// the final 128 bits
	adds		arg3, arg3, #16
	beq		_128_done

	// here we have fewer than 16 bytes of data left.
	// since we know that there was data before the pointer, we can
	// offset the input pointer back before the current position to load
	// exactly 16 bytes. after that the registers need to be adjusted.
_get_last_two_regs:
	add		arg2, arg2, arg3
	sub		arg2, arg2, #16
	vld1.64		{q1}, [arg2]
CPU_LE(	vrev64.8	q1, q1	)
	vswp		d2, d3

	// get rid of the extra data that was loaded before
	// load the shift constant
	adr		ip, tbl_shf_table + 16
	sub		ip, ip, arg3
	vld1.8		{q0}, [ip]

	// shift q7 left by arg3 bytes (result in q2)
	vtbl.8		d4, {d14-d15}, d0
	vtbl.8		d5, {d14-d15}, d1

	// shift q7 right by 16-arg3 bytes (result in q9)
	vmov.i8		q9, #0x80
	veor.8		q0, q0, q9
	vtbl.8		d18, {d14-d15}, d0
	vtbl.8		d19, {d14-d15}, d1

	// blend
	vshr.s8		q0, q0, #7		// convert to 8-bit mask
	vbsl.8		q0, q2, q1

	// fold 16 bytes
	vmull.p64	q8, d18, d20
	vmull.p64	q7, d19, d21
	veor.8		q7, q7, q8
	veor.8		q7, q7, q0

_128_done:
	// compute crc of a 128-bit value
	vldr		d20, rk5
	vldr		d21, rk6		// rk5 and rk6 in q10

	// 64b fold
	vext.8		q0, qzr, q7, #8
	vmull.p64	q7, d15, d20
	veor.8		q7, q7, q0

	// 32b fold
	vext.8		q0, q7, qzr, #12
	vmov		s31, s3
	vmull.p64	q0, d0, d21
	veor.8		q7, q0, q7

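	// Barrett reduction replaces the final division by Q with two
	// carry-less multiplications: multiplying by rk7 = floor(2^64/Q)
	// approximates the quotient, and multiplying that quotient by
	// rk8 = Q and XORing the product with the folded value leaves the
	// remainder. The 16-bit CRC is then taken from the upper half of
	// the resulting 32-bit word in _cleanup.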
	// barrett reduction
_barrett:
	vldr		d20, rk7
	vldr		d21, rk8

	vmull.p64	q0, d15, d20
	vext.8		q0, qzr, q0, #12
	vmull.p64	q0, d1, d21
	vext.8		q0, qzr, q0, #12
	veor.8		q7, q7, q0
	vmov		r0, s29

_cleanup:
	// scale the result back to 16 bits
	lsr		r0, r0, #16
	bx		lr

_less_than_128:
	teq		arg3, #0
	beq		_cleanup

	vmov.i8		q0, #0
	vmov		s3, arg1_low32		// get the initial crc value

	vld1.64		{q7}, [arg2, :128]!
CPU_LE(	vrev64.8	q7, q7	)
	vswp		d14, d15
	veor.8		q7, q7, q0

	cmp		arg3, #16
	beq		_128_done		// exactly 16 left
	blt		_less_than_16_left

	// more than 16 bytes left: load the fold constants
	vldr		d20, rk1
	vldr		d21, rk2		// rk1 and rk2 in q10

	// check if there is enough buffer to be able to fold 16 bytes at a time
	subs		arg3, arg3, #32
	addlt		arg3, arg3, #16
	blt		_get_last_two_regs
	b		_16B_reduction_loop

_less_than_16_left:
	// load the shift constant for the final, partial block
	adr		ip, tbl_shf_table + 16
	sub		ip, ip, arg3
	vld1.8		{q0}, [ip]
	vmov.i8		q9, #0x80
	veor.8		q0, q0, q9
	vtbl.8		d18, {d14-d15}, d0
	vtbl.8		d15, {d14-d15}, d1
	vmov		d14, d18
	b		_128_done
ENDPROC(crc_t10dif_pmull)

// precomputed constants
// these constants are precomputed from the poly:
//	0x8bb70000 (0x8bb7 scaled to 32 bits)
	.align		4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q

rk3:	.quad		0x9d9d000000000000
rk4:	.quad		0x7cf5000000000000
rk5:	.quad		0x2d56000000000000
rk6:	.quad		0x1368000000000000
rk7:	.quad		0x00000001f65a57f8
rk8:	.quad		0x000000018bb70000
rk9:	.quad		0xceae000000000000
rk10:	.quad		0xbfd6000000000000
rk11:	.quad		0x1e16000000000000
rk12:	.quad		0x713c000000000000
rk13:	.quad		0xf7f9000000000000
rk14:	.quad		0x80a6000000000000
rk15:	.quad		0x044c000000000000
rk16:	.quad		0xe658000000000000
rk17:	.quad		0xad18000000000000
rk18:	.quad		0xa497000000000000
rk19:	.quad		0x6ee3000000000000
rk20:	.quad		0xe7b5000000000000
rk1:	.quad		0x2d56000000000000
rk2:	.quad		0x06df000000000000

tbl_shf_table:
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15

	.byte		0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte		0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x0
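
// Worked example: with 3 bytes of input remaining, the mask is loaded from
// tbl_shf_table + 13, i.e. { 0x8d, 0x8e, 0x8f, 0x00, 0x01, ..., 0x0c }.
// The three entries with the top bit set index past the end of the 16-byte
// vtbl table and therefore produce zero bytes, so the first vtbl pair in
// _get_last_two_regs shifts the previous remainder left by 3 byte positions;
// XORing the mask with 0x80 inverts the roles and yields the matching right
// shift by 13 bytes that feeds the final 16-byte fold.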