//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Function API:
// UINT16 crc_t10dif_pmull(
//         UINT16 init_crc,          // initial CRC value, 16 bits
//         const unsigned char *buf, // buffer pointer to calculate CRC on
//         UINT64 len                // buffer length in bytes (64-bit data)
// );
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//

#include <linux/linkage.h>
#include <asm/assembler.h>

        .text
        .cpu            generic+crypto

        arg1_low32      .req    w19
        arg2            .req    x20
        arg3            .req    x21

        vzr             .req    v13

ENTRY(crc_t10dif_pmull)
        frame_push      3, 128

        mov             arg1_low32, w0
        mov             arg2, x1
        mov             arg3, x2

        movi            vzr.16b, #0             // init zero register

        // adjust the 16-bit initial_crc value, scale it to 32 bits
        lsl             arg1_low32, arg1_low32, #16

        // check if smaller than 256
        cmp             arg3, #256

        // for sizes less than 256, we can't fold 128 bytes at a time...
        b.lt            _less_than_128

        // load the initial crc value
        // crc value does not need to be byte-reflected, but it needs
        // to be moved to the high part of the register.
        // because data will be byte-reflected and will align with
        // initial crc at correct place.
        movi            v10.16b, #0
        mov             v10.s[3], arg1_low32    // initial crc

        // receive the initial 128 bytes of data, xor the initial crc value
        ldp             q0, q1, [arg2]
        ldp             q2, q3, [arg2, #0x20]
        ldp             q4, q5, [arg2, #0x40]
        ldp             q6, q7, [arg2, #0x60]
        add             arg2, arg2, #0x80

CPU_LE( rev64           v0.16b, v0.16b          )
CPU_LE( rev64           v1.16b, v1.16b          )
CPU_LE( rev64           v2.16b, v2.16b          )
CPU_LE( rev64           v3.16b, v3.16b          )
CPU_LE( rev64           v4.16b, v4.16b          )
CPU_LE( rev64           v5.16b, v5.16b          )
CPU_LE( rev64           v6.16b, v6.16b          )
CPU_LE( rev64           v7.16b, v7.16b          )

CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8 )
CPU_LE( ext             v1.16b, v1.16b, v1.16b, #8 )
CPU_LE( ext             v2.16b, v2.16b, v2.16b, #8 )
CPU_LE( ext             v3.16b, v3.16b, v3.16b, #8 )
CPU_LE( ext             v4.16b, v4.16b, v4.16b, #8 )
CPU_LE( ext             v5.16b, v5.16b, v5.16b, #8 )
CPU_LE( ext             v6.16b, v6.16b, v6.16b, #8 )
CPU_LE( ext             v7.16b, v7.16b, v7.16b, #8 )

        // XOR the initial_crc value
        eor             v0.16b, v0.16b, v10.16b

        ldr_l           q10, rk3, x8            // v10 has rk3 and rk4
                                                // type of pmull instruction
                                                // will determine which constant to use

        //
        // we subtract 256 instead of 128 to save one instruction from the loop
        //
        sub             arg3, arg3, #256

        // at this section of the code, there is 128*x+y (0 <= y < 128) bytes
        // of buffer. The _fold_64_B_loop will fold 128 bytes at a time
        // until we have 128+y bytes of buffer left
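        // A note on the fold step below (editorial sketch, not part of the
        // original description): each 128-bit state register is folded into
        // the chunk loaded 128 bytes further along in the buffer. Per the
        // reference paper, folding a 128-bit value T = T_hi * x^64 + T_lo
        // forward by N bits into the chunk D that lies N bits ahead is
        //
        //     fold(T, D) = clmul(T_hi, x^(N+64) mod P) ^ clmul(T_lo, x^N mod P) ^ D
        //
        // where P is the (scaled) CRC polynomial. The fold64 macro computes
        // the two carry-less products with pmull2 (upper 64-bit halves,
        // constant in v10.d[1]) and pmull (lower halves, constant in
        // v10.d[0]). As illustrative pseudocode, using a hypothetical
        // clmul64() helper:
        //
        //     // state: one 128-bit register; k_hi/k_lo: fold constants in v10
        //     state = clmul64(hi64(state), k_hi) ^ clmul64(lo64(state), k_lo)
        //           ^ next_chunk;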
        // fold 128 bytes at a time. This section of the code folds 8 vector
        // registers in parallel
_fold_64_B_loop:

        .macro          fold64, reg1, reg2
        ldp             q11, q12, [arg2], #0x20

        pmull2          v8.1q, \reg1\().2d, v10.2d
        pmull           \reg1\().1q, \reg1\().1d, v10.1d

CPU_LE( rev64           v11.16b, v11.16b        )
CPU_LE( rev64           v12.16b, v12.16b        )

        pmull2          v9.1q, \reg2\().2d, v10.2d
        pmull           \reg2\().1q, \reg2\().1d, v10.1d

CPU_LE( ext             v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext             v12.16b, v12.16b, v12.16b, #8 )

        eor             \reg1\().16b, \reg1\().16b, v8.16b
        eor             \reg2\().16b, \reg2\().16b, v9.16b
        eor             \reg1\().16b, \reg1\().16b, v11.16b
        eor             \reg2\().16b, \reg2\().16b, v12.16b
        .endm

        fold64          v0, v1
        fold64          v2, v3
        fold64          v4, v5
        fold64          v6, v7

        subs            arg3, arg3, #128

        // check if there is another 128 bytes in the buffer to be able to fold
        b.lt            _fold_64_B_end

        if_will_cond_yield_neon
        stp             q0, q1, [sp, #.Lframe_local_offset]
        stp             q2, q3, [sp, #.Lframe_local_offset + 32]
        stp             q4, q5, [sp, #.Lframe_local_offset + 64]
        stp             q6, q7, [sp, #.Lframe_local_offset + 96]
        do_cond_yield_neon
        ldp             q0, q1, [sp, #.Lframe_local_offset]
        ldp             q2, q3, [sp, #.Lframe_local_offset + 32]
        ldp             q4, q5, [sp, #.Lframe_local_offset + 64]
        ldp             q6, q7, [sp, #.Lframe_local_offset + 96]
        ldr_l           q10, rk3, x8
        movi            vzr.16b, #0             // init zero register
        endif_yield_neon

        b               _fold_64_B_loop

_fold_64_B_end:
        // at this point, the buffer pointer is pointing at the last y bytes
        // of the buffer, and the 128 bytes of folded data are in 8 of the
        // vector registers: v0 - v7

        // fold the 8 vector registers to 1 vector register with different
        // constants

        ldr_l           q10, rk9, x8

        .macro          fold16, reg, rk
        pmull           v8.1q, \reg\().1d, v10.1d
        pmull2          \reg\().1q, \reg\().2d, v10.2d
        .ifnb           \rk
        ldr_l           q10, \rk, x8
        .endif
        eor             v7.16b, v7.16b, v8.16b
        eor             v7.16b, v7.16b, \reg\().16b
        .endm

        fold16          v0, rk11
        fold16          v1, rk13
        fold16          v2, rk15
        fold16          v3, rk17
        fold16          v4, rk19
        fold16          v5, rk1
        fold16          v6

        // instead of adding 128, we add 128-16 to the loop counter to save 1
        // instruction from the loop; instead of a cmp instruction, we use the
        // negative flag with the b.lt instruction
        adds            arg3, arg3, #(128-16)
        b.lt            _final_reduction_for_128

        // now we have 16+y bytes left to reduce. 16 bytes are in register v7
        // and the rest is in memory. We can fold 16 bytes at a time if y >= 16
        // continue folding 16 bytes at a time

_16B_reduction_loop:
        pmull           v8.1q, v7.1d, v10.1d
        pmull2          v7.1q, v7.2d, v10.2d
        eor             v7.16b, v7.16b, v8.16b

        ldr             q0, [arg2], #16
CPU_LE( rev64           v0.16b, v0.16b          )
CPU_LE( ext             v0.16b, v0.16b, v0.16b, #8 )
        eor             v7.16b, v7.16b, v0.16b
        subs            arg3, arg3, #16

        // instead of a cmp instruction, we utilize the flags with the
        // b.ge instruction, equivalent of: cmp arg3, 16-16
        // check if there is any more 16 bytes in the buffer to be able to fold
        b.ge            _16B_reduction_loop

        // now we have 16+z bytes left to reduce, where 0 <= z < 16.
        // first, we reduce the data in the v7 register

_final_reduction_for_128:
        // check if any more data to fold. If not, compute the CRC of
        // the final 128 bits
        adds            arg3, arg3, #16
        b.eq            _128_done

        // here we are getting data that is less than 16 bytes.
        // since we know that there was data before the pointer, we can
        // offset the input pointer before the actual point, to receive
        // exactly 16 bytes. after that the registers need to be adjusted.
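        // In effect (editorial sketch of the trick below, following the
        // reference paper): with z == arg3 trailing bytes left, the state in
        // v7 followed by those z bytes can be rewritten as
        //
        //     H * x^128  ^  ( L * x^(8*z) ^ tail )
        //
        // where H is the top z bytes of v7 and L is its low 16-z bytes.
        // The two tbl shuffles below produce L * x^(8*z) and H, the bsl
        // merges in the tail bytes from the reloaded last 16 bytes of the
        // buffer, and one more 16-byte fold with the constants already in
        // v10 reduces everything to the 128 bits handled by _128_done.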
_get_last_two_regs:
        add             arg2, arg2, arg3
        ldr             q1, [arg2, #-16]
CPU_LE( rev64           v1.16b, v1.16b          )
CPU_LE( ext             v1.16b, v1.16b, v1.16b, #8 )

        // get rid of the extra data that was loaded before
        // load the shift constant
        adr_l           x4, tbl_shf_table + 16
        sub             x4, x4, arg3
        ld1             {v0.16b}, [x4]

        // shift v7 to the left by arg3 bytes, result in v2
        tbl             v2.16b, {v7.16b}, v0.16b

        // shift v7 to the right by 16-arg3 bytes
        movi            v9.16b, #0x80
        eor             v0.16b, v0.16b, v9.16b
        tbl             v7.16b, {v7.16b}, v0.16b

        // blend
        sshr            v0.16b, v0.16b, #7      // convert to 8-bit mask
        bsl             v0.16b, v2.16b, v1.16b

        // fold 16 bytes
        pmull           v8.1q, v7.1d, v10.1d
        pmull2          v7.1q, v7.2d, v10.2d
        eor             v7.16b, v7.16b, v8.16b
        eor             v7.16b, v7.16b, v0.16b

_128_done:
        // compute crc of a 128-bit value
        ldr_l           q10, rk5, x8            // rk5 and rk6 in v10

        // 64b fold
        ext             v0.16b, vzr.16b, v7.16b, #8
        mov             v7.d[0], v7.d[1]
        pmull           v7.1q, v7.1d, v10.1d
        eor             v7.16b, v7.16b, v0.16b

        // 32b fold
        ext             v0.16b, v7.16b, vzr.16b, #4
        mov             v7.s[3], vzr.s[0]
        pmull2          v0.1q, v0.2d, v10.2d
        eor             v7.16b, v7.16b, v0.16b
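        // Barrett reduction, per the reference paper (editorial note; the
        // exact lane placement below differs because the CRC is kept scaled
        // up to 32 bits throughout): for a value R of at most 64 significant
        // bits and the scaled polynomial Q, with mu = floor(x^64 / Q) (rk7)
        // and Q itself (rk8),
        //
        //     T1  = clmul( floor(R / x^32), mu )
        //     T2  = clmul( floor(T1 / x^32), Q )
        //     crc = (R ^ T2) mod x^32
        //
        // The pmull/pmull2 and ext instructions below carry out the two
        // carry-less products and the x^32 alignment steps; _cleanup then
        // drops the extra 16 bits of scaling.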
        // barrett reduction
_barrett:
        ldr_l           q10, rk7, x8
        mov             v0.d[0], v7.d[1]

        pmull           v0.1q, v0.1d, v10.1d
        ext             v0.16b, vzr.16b, v0.16b, #12
        pmull2          v0.1q, v0.2d, v10.2d
        ext             v0.16b, vzr.16b, v0.16b, #12
        eor             v7.16b, v7.16b, v0.16b
        mov             w0, v7.s[1]

_cleanup:
        // scale the result back to 16 bits
        lsr             x0, x0, #16
        frame_pop
        ret

_less_than_128:
        cbz             arg3, _cleanup

        movi            v0.16b, #0
        mov             v0.s[3], arg1_low32     // get the initial crc value

        ldr             q7, [arg2], #0x10
CPU_LE( rev64           v7.16b, v7.16b          )
CPU_LE( ext             v7.16b, v7.16b, v7.16b, #8 )
        eor             v7.16b, v7.16b, v0.16b  // xor the initial crc value

        cmp             arg3, #16
        b.eq            _128_done               // exactly 16 left
        b.lt            _less_than_16_left

        ldr_l           q10, rk1, x8            // rk1 and rk2 in v10

        // update the counter. subtract 32 instead of 16 to save one
        // instruction from the loop
        subs            arg3, arg3, #32
        b.ge            _16B_reduction_loop

        add             arg3, arg3, #16
        b               _get_last_two_regs

_less_than_16_left:
        adr_l           x0, tbl_shf_table + 16
        sub             x0, x0, arg3
        ld1             {v0.16b}, [x0]
        movi            v9.16b, #0x80
        eor             v0.16b, v0.16b, v9.16b
        tbl             v7.16b, {v7.16b}, v0.16b
        b               _128_done
ENDPROC(crc_t10dif_pmull)

// precomputed constants
// these constants are precomputed from the poly:
//      0x8bb70000 (0x8bb7 scaled to 32 bits)
        .section        ".rodata", "a"
        .align          4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q

rk1:    .octa           0x06df0000000000002d56000000000000
rk3:    .octa           0x7cf50000000000009d9d000000000000
rk5:    .octa           0x13680000000000002d56000000000000
rk7:    .octa           0x000000018bb7000000000001f65a57f8
rk9:    .octa           0xbfd6000000000000ceae000000000000
rk11:   .octa           0x713c0000000000001e16000000000000
rk13:   .octa           0x80a6000000000000f7f9000000000000
rk15:   .octa           0xe658000000000000044c000000000000
rk17:   .octa           0xa497000000000000ad18000000000000
rk19:   .octa           0xe7b50000000000006ee3000000000000

tbl_shf_table:
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
//      DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
//      DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
//      DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
//      DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
//      DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
//      DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
//      DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
//      DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
//      DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
//      DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
//      DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
//      DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
//      DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
//      DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
//      DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15

        .byte            0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
        .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0
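// Editorial note on how the shift table above is indexed: the code computes
// tbl_shf_table + 16 - arg3 and loads 16 bytes from there, so the mask window
// starts 16 - arg3 bytes into the table. Entries with the top bit set (0x8x)
// make tbl write a zero byte (out-of-range index for a one-register table),
// while the plain 0x0 - 0xe entries select bytes from v7. For example, with
// arg3 == 4 the loaded mask is
//
//      8c 8d 8e 8f 00 01 02 03 04 05 06 07 08 09 0a 0b
//
// which zeroes the low 4 lanes and moves v7's bytes 0-11 up by 4 lanes;
// XORing the mask with 0x80 flips the selection and yields v7's top 4 bytes
// in the low lanes instead.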