########################################################################
# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
#
# Copyright (c) 2013, Intel Corporation
#
# Authors:
#       Erdinc Ozturk <erdinc.ozturk@intel.com>
#       Vinodh Gopal <vinodh.gopal@intel.com>
#       James Guilford <james.guilford@intel.com>
#       Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
#       Function API:
#       UINT16 crc_t10dif_pcl(
#               UINT16 init_crc,          //initial CRC value, 16 bits
#               const unsigned char *buf, //buffer pointer to calculate CRC on
#               UINT64 len                //buffer length in bytes (64-bit data)
#       );
#
#       Reference paper titled "Fast CRC Computation for Generic
#       Polynomials Using PCLMULQDQ Instruction"
#       URL: http://www.intel.com/content/dam/www/public/us/en/documents
#       /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
#
#

#include <linux/linkage.h>

.text

#define        arg1 %rdi
#define        arg2 %rsi
#define        arg3 %rdx

#define        arg1_low32 %edi

ENTRY(crc_t10dif_pcl)
.align 16

        # adjust the 16-bit initial_crc value, scale it to 32 bits
        shl     $16, arg1_low32

        # Allocate Stack Space
        mov     %rsp, %rcx
        sub     $16*2, %rsp
        # align stack to 16 byte boundary
        and     $~(0x10 - 1), %rsp

        # check if smaller than 256
        cmp     $256, arg3

        # for sizes less than 256, we can't fold 128B at a time...
        jl      _less_than_128


        # load the initial crc value
        movd    arg1_low32, %xmm10      # initial crc

        # crc value does not need to be byte-reflected, but it needs
        # to be moved to the high part of the register.
        # because data will be byte-reflected and will align with
        # initial crc at correct place.
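        # For example, with init_crc = 0x1234: the shl above leaves the seed
        # in bits 16-31 of arg1_low32, and the pslldq below moves it to bits
        # 112-127 (the two most significant bytes) of xmm10. After the
        # byte-reflecting pshufb, the first two message bytes occupy the same
        # position in xmm0, so the pxor below adds the seed onto the start of
        # the message, which is where a CRC seed belongs.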
        pslldq  $12, %xmm10

        movdqa  SHUF_MASK(%rip), %xmm11
        # receive the initial 128B data, xor the initial crc value
        movdqu  16*0(arg2), %xmm0
        movdqu  16*1(arg2), %xmm1
        movdqu  16*2(arg2), %xmm2
        movdqu  16*3(arg2), %xmm3
        movdqu  16*4(arg2), %xmm4
        movdqu  16*5(arg2), %xmm5
        movdqu  16*6(arg2), %xmm6
        movdqu  16*7(arg2), %xmm7

        pshufb  %xmm11, %xmm0
        # XOR the initial_crc value
        pxor    %xmm10, %xmm0
        pshufb  %xmm11, %xmm1
        pshufb  %xmm11, %xmm2
        pshufb  %xmm11, %xmm3
        pshufb  %xmm11, %xmm4
        pshufb  %xmm11, %xmm5
        pshufb  %xmm11, %xmm6
        pshufb  %xmm11, %xmm7

        movdqa  rk3(%rip), %xmm10       # xmm10 has rk3 and rk4
                                        # imm value of pclmulqdq instruction
                                        # will determine which constant to use

        #################################################################
        # we subtract 256 instead of 128 to save one instruction from the loop
        sub     $256, arg3

        # at this section of the code, there is 128*x+y (0<=y<128) bytes of
        # buffer. The _fold_64_B_loop will fold 128B at a time
        # until we have 128+y Bytes of buffer


        # fold 128B at a time. This section of the code folds 8 xmm
        # registers in parallel
_fold_64_B_loop:

        # update the buffer pointer
        add     $128, arg2              # buf += 128

        movdqu  16*0(arg2), %xmm9
        movdqu  16*1(arg2), %xmm12
        pshufb  %xmm11, %xmm9
        pshufb  %xmm11, %xmm12
        movdqa  %xmm0, %xmm8
        movdqa  %xmm1, %xmm13
        pclmulqdq       $0x0, %xmm10, %xmm0
        pclmulqdq       $0x11, %xmm10, %xmm8
        pclmulqdq       $0x0, %xmm10, %xmm1
        pclmulqdq       $0x11, %xmm10, %xmm13
        pxor    %xmm9, %xmm0
        xorps   %xmm8, %xmm0
        pxor    %xmm12, %xmm1
        xorps   %xmm13, %xmm1

        movdqu  16*2(arg2), %xmm9
        movdqu  16*3(arg2), %xmm12
        pshufb  %xmm11, %xmm9
        pshufb  %xmm11, %xmm12
        movdqa  %xmm2, %xmm8
        movdqa  %xmm3, %xmm13
        pclmulqdq       $0x0, %xmm10, %xmm2
        pclmulqdq       $0x11, %xmm10, %xmm8
        pclmulqdq       $0x0, %xmm10, %xmm3
        pclmulqdq       $0x11, %xmm10, %xmm13
        pxor    %xmm9, %xmm2
        xorps   %xmm8, %xmm2
        pxor    %xmm12, %xmm3
        xorps   %xmm13, %xmm3

        movdqu  16*4(arg2), %xmm9
        movdqu  16*5(arg2), %xmm12
        pshufb  %xmm11, %xmm9
        pshufb  %xmm11, %xmm12
        movdqa  %xmm4, %xmm8
        movdqa  %xmm5, %xmm13
        pclmulqdq       $0x0, %xmm10, %xmm4
        pclmulqdq       $0x11, %xmm10, %xmm8
        pclmulqdq       $0x0, %xmm10, %xmm5
        pclmulqdq       $0x11, %xmm10, %xmm13
        pxor    %xmm9, %xmm4
        xorps   %xmm8, %xmm4
        pxor    %xmm12, %xmm5
        xorps   %xmm13, %xmm5

        movdqu  16*6(arg2), %xmm9
        movdqu  16*7(arg2), %xmm12
        pshufb  %xmm11, %xmm9
        pshufb  %xmm11, %xmm12
        movdqa  %xmm6, %xmm8
        movdqa  %xmm7, %xmm13
        pclmulqdq       $0x0, %xmm10, %xmm6
        pclmulqdq       $0x11, %xmm10, %xmm8
        pclmulqdq       $0x0, %xmm10, %xmm7
        pclmulqdq       $0x11, %xmm10, %xmm13
        pxor    %xmm9, %xmm6
        xorps   %xmm8, %xmm6
        pxor    %xmm12, %xmm7
        xorps   %xmm13, %xmm7

        sub     $128, arg3

        # check if there is another 128B in the buffer to be able to fold
        jge     _fold_64_B_loop
        ##################################################################


        add     $128, arg2
        # at this point, the buffer pointer is pointing at the last y Bytes
        # of the buffer, and the 128B of folded data is in 8 of the xmm
        # registers: xmm0 through xmm7


        # fold the 8 xmm registers to 1 xmm register with different constants

        movdqa  rk9(%rip), %xmm10
        movdqa  %xmm0, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm0
        pclmulqdq       $0x0, %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        xorps   %xmm0, %xmm7
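        # xmm1 through xmm6 are folded into xmm7 the same way below, each with
        # its own constant pair (rk11/rk12 ... rk19/rk20, and finally rk1/rk2)
        # matching its successively shorter distance from the end of the
        # 128-byte block; rk1/rk2 are also the 16-byte fold constants that
        # _16B_reduction_loop keeps using afterwards.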

        movdqa  rk11(%rip), %xmm10
        movdqa  %xmm1, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm1
        pclmulqdq       $0x0, %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        xorps   %xmm1, %xmm7

        movdqa  rk13(%rip), %xmm10
        movdqa  %xmm2, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm2
        pclmulqdq       $0x0, %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        pxor    %xmm2, %xmm7

        movdqa  rk15(%rip), %xmm10
        movdqa  %xmm3, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm3
        pclmulqdq       $0x0, %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        xorps   %xmm3, %xmm7

        movdqa  rk17(%rip), %xmm10
        movdqa  %xmm4, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm4
        pclmulqdq       $0x0, %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        pxor    %xmm4, %xmm7

        movdqa  rk19(%rip), %xmm10
        movdqa  %xmm5, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm5
        pclmulqdq       $0x0, %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        xorps   %xmm5, %xmm7

        movdqa  rk1(%rip), %xmm10       # xmm10 has rk1 and rk2
                                        # imm value of pclmulqdq instruction
                                        # will determine which constant to use
        movdqa  %xmm6, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm6
        pclmulqdq       $0x0, %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        pxor    %xmm6, %xmm7


        # instead of 128, we add 112 (128-16) to the loop counter to save
        # one instruction from the loop
        # instead of a cmp instruction, we use the negative flag with the
        # jl instruction
        add     $128-16, arg3
        jl      _final_reduction_for_128

        # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
        # and the rest is in memory. We can fold 16 bytes at a time if y>=16
        # continue folding 16B at a time

_16B_reduction_loop:
        movdqa  %xmm7, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm7
        pclmulqdq       $0x0, %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        movdqu  (arg2), %xmm0
        pshufb  %xmm11, %xmm0
        pxor    %xmm0, %xmm7
        add     $16, arg2
        sub     $16, arg3
        # instead of a cmp instruction, we utilize the flags with the
        # jge instruction
        # equivalent of: cmp arg3, 16-16
        # check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        # now we have 16+z bytes left to reduce, where 0<= z < 16.
        # first, we reduce the data in the xmm7 register


_final_reduction_for_128:
        # check if any more data to fold. If not, compute the CRC of
        # the final 128 bits
        add     $16, arg3
        je      _128_done

        # here we are getting data that is less than 16 bytes.
        # since we know that there was data before the pointer, we can
        # offset the input pointer before the actual point, to receive
        # exactly 16 bytes. after that the registers need to be adjusted.
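        # For example, with arg3 = 3: xmm1 below receives the last 16 bytes of
        # the buffer (13 bytes that have already been folded plus the 3
        # trailing ones). The shuffles split xmm7 into its 3 most significant
        # bytes (kept in xmm7) and its remaining 13 bytes shifted up by 3
        # (in xmm2); the blend then overlays the 3 new bytes onto xmm2, and
        # the rk1/rk2 fold of xmm7 into that merged block reduces everything
        # back to a single 16-byte value.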
_get_last_two_xmms:
        movdqa  %xmm7, %xmm2

        movdqu  -16(arg2, arg3), %xmm1
        pshufb  %xmm11, %xmm1

        # get rid of the extra data that was loaded before
        # load the shift constant
        lea     pshufb_shf_table+16(%rip), %rax
        sub     arg3, %rax
        movdqu  (%rax), %xmm0

        # shift xmm2 to the left by arg3 bytes
        pshufb  %xmm0, %xmm2

        # shift xmm7 to the right by 16-arg3 bytes
        pxor    mask1(%rip), %xmm0
        pshufb  %xmm0, %xmm7
        pblendvb        %xmm2, %xmm1    # xmm0 is implicit

        # fold 16 Bytes
        movdqa  %xmm1, %xmm2
        movdqa  %xmm7, %xmm8
        pclmulqdq       $0x11, %xmm10, %xmm7
        pclmulqdq       $0x0, %xmm10, %xmm8
        pxor    %xmm8, %xmm7
        pxor    %xmm2, %xmm7

_128_done:
        # compute crc of a 128-bit value
        movdqa  rk5(%rip), %xmm10       # rk5 and rk6 in xmm10
        movdqa  %xmm7, %xmm0

        # 64b fold
        pclmulqdq       $0x1, %xmm10, %xmm7
        pslldq  $8, %xmm0
        pxor    %xmm0, %xmm7

        # 32b fold
        movdqa  %xmm7, %xmm0

        pand    mask2(%rip), %xmm0

        psrldq  $12, %xmm7
        pclmulqdq       $0x10, %xmm10, %xmm7
        pxor    %xmm0, %xmm7

        # barrett reduction
_barrett:
        movdqa  rk7(%rip), %xmm10       # rk7 and rk8 in xmm10
        movdqa  %xmm7, %xmm0
        pclmulqdq       $0x01, %xmm10, %xmm7
        pslldq  $4, %xmm7
        pclmulqdq       $0x11, %xmm10, %xmm7

        pslldq  $4, %xmm7
        pxor    %xmm0, %xmm7
        pextrd  $1, %xmm7, %eax

_cleanup:
        # scale the result back to 16 bits
        shr     $16, %eax
        mov     %rcx, %rsp
        ret

########################################################################

.align 16
_less_than_128:

        # check if there is enough buffer to be able to fold 16B at a time
        cmp     $32, arg3
        jl      _less_than_32
        movdqa  SHUF_MASK(%rip), %xmm11

        # now if there is, load the constants
        movdqa  rk1(%rip), %xmm10       # rk1 and rk2 in xmm10

        movd    arg1_low32, %xmm0       # get the initial crc value
        pslldq  $12, %xmm0              # align it to its correct place
        movdqu  (arg2), %xmm7           # load the plaintext
        pshufb  %xmm11, %xmm7           # byte-reflect the plaintext
        pxor    %xmm0, %xmm7


        # update the buffer pointer
        add     $16, arg2

        # update the counter. subtract 32 instead of 16 to save one
        # instruction from the loop
        sub     $32, arg3

        jmp     _16B_reduction_loop


.align 16
_less_than_32:
        # mov initial crc to the return value. this is necessary for
        # zero-length buffers.
        mov     arg1_low32, %eax
        test    arg3, arg3
        je      _cleanup

        movdqa  SHUF_MASK(%rip), %xmm11

        movd    arg1_low32, %xmm0       # get the initial crc value
        pslldq  $12, %xmm0              # align it to its correct place

        cmp     $16, arg3
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  (arg2), %xmm7           # load the plaintext
        pshufb  %xmm11, %xmm7           # byte-reflect the plaintext
        pxor    %xmm0, %xmm7            # xor the initial crc value
        add     $16, arg2
        sub     $16, arg3
        movdqa  rk1(%rip), %xmm10       # rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms


.align 16
_less_than_16_left:
        # use stack space to load data less than 16 bytes, zero-out
        # the 16B in memory first.
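        # For example, 13 leftover bytes are copied into the zeroed 16-byte
        # stack slot as one 8-byte, one 4-byte and one 1-byte move, while the
        # rest of the slot stays zero.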

        pxor    %xmm1, %xmm1
        mov     %rsp, %r11
        movdqa  %xmm1, (%r11)

        cmp     $4, arg3
        jl      _only_less_than_4

        # backup the counter value
        mov     arg3, %r9
        cmp     $8, arg3
        jl      _less_than_8_left

        # load 8 Bytes
        mov     (arg2), %rax
        mov     %rax, (%r11)
        add     $8, %r11
        sub     $8, arg3
        add     $8, arg2
_less_than_8_left:

        cmp     $4, arg3
        jl      _less_than_4_left

        # load 4 Bytes
        mov     (arg2), %eax
        mov     %eax, (%r11)
        add     $4, %r11
        sub     $4, arg3
        add     $4, arg2
_less_than_4_left:

        cmp     $2, arg3
        jl      _less_than_2_left

        # load 2 Bytes
        mov     (arg2), %ax
        mov     %ax, (%r11)
        add     $2, %r11
        sub     $2, arg3
        add     $2, arg2
_less_than_2_left:
        cmp     $1, arg3
        jl      _zero_left

        # load 1 Byte
        mov     (arg2), %al
        mov     %al, (%r11)
_zero_left:
        movdqa  (%rsp), %xmm7
        pshufb  %xmm11, %xmm7
        pxor    %xmm0, %xmm7            # xor the initial crc value

        # shl r9, 4
        lea     pshufb_shf_table+16(%rip), %rax
        sub     %r9, %rax
        movdqu  (%rax), %xmm0
        pxor    mask1(%rip), %xmm0

        pshufb  %xmm0, %xmm7
        jmp     _128_done

.align 16
_exact_16_left:
        movdqu  (arg2), %xmm7
        pshufb  %xmm11, %xmm7
        pxor    %xmm0, %xmm7            # xor the initial crc value

        jmp     _128_done

_only_less_than_4:
        cmp     $3, arg3
        jl      _only_less_than_3

        # load 3 Bytes
        mov     (arg2), %al
        mov     %al, (%r11)

        mov     1(arg2), %al
        mov     %al, 1(%r11)

        mov     2(arg2), %al
        mov     %al, 2(%r11)

        movdqa  (%rsp), %xmm7
        pshufb  %xmm11, %xmm7
        pxor    %xmm0, %xmm7            # xor the initial crc value

        psrldq  $5, %xmm7

        jmp     _barrett
_only_less_than_3:
        cmp     $2, arg3
        jl      _only_less_than_2

        # load 2 Bytes
        mov     (arg2), %al
        mov     %al, (%r11)

        mov     1(arg2), %al
        mov     %al, 1(%r11)

        movdqa  (%rsp), %xmm7
        pshufb  %xmm11, %xmm7
        pxor    %xmm0, %xmm7            # xor the initial crc value

        psrldq  $6, %xmm7

        jmp     _barrett
_only_less_than_2:

        # load 1 Byte
        mov     (arg2), %al
        mov     %al, (%r11)

        movdqa  (%rsp), %xmm7
        pshufb  %xmm11, %xmm7
        pxor    %xmm0, %xmm7            # xor the initial crc value

        psrldq  $7, %xmm7

        jmp     _barrett

ENDPROC(crc_t10dif_pcl)

.section        .rodata, "a", @progbits
.align 16
# precomputed constants
# these constants are precomputed from the poly:
#       0x8bb70000 (0x8bb7 scaled to 32 bits)
# Q = 0x18BB70000
# rk1 = 2^(32*3) mod Q << 32
# rk2 = 2^(32*5) mod Q << 32
# rk3 = 2^(32*15) mod Q << 32
# rk4 = 2^(32*17) mod Q << 32
# rk5 = 2^(32*3) mod Q << 32
# rk6 = 2^(32*2) mod Q << 32
# rk7 = floor(2^64/Q)
# rk8 = Q
rk1:
.quad 0x2d56000000000000
rk2:
.quad 0x06df000000000000
rk3:
.quad 0x9d9d000000000000
rk4:
.quad 0x7cf5000000000000
rk5:
.quad 0x2d56000000000000
rk6:
.quad 0x1368000000000000
rk7:
.quad 0x00000001f65a57f8
rk8:
.quad 0x000000018bb70000

rk9:
.quad 0xceae000000000000
rk10:
.quad 0xbfd6000000000000
rk11:
.quad 0x1e16000000000000
rk12:
.quad 0x713c000000000000
rk13:
.quad 0xf7f9000000000000
rk14:
.quad 0x80a6000000000000
rk15:
.quad 0x044c000000000000
rk16:
.quad 0xe658000000000000
rk17:
.quad 0xad18000000000000
rk18:
.quad 0xa497000000000000
rk19:
.quad 0x6ee3000000000000
rk20:
.quad 0xe7b5000000000000



.section        .rodata.cst16.mask1, "aM", @progbits, 16
.align 16
mask1:
.octa 0x80808080808080808080808080808080
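# mask1 is used at _get_last_two_xmms and _zero_left: XORing a
# pshufb_shf_table entry with it flips the top bit of every byte, which turns
# the "shift left by n bytes" control into the complementary "shift right by
# 16-n bytes" control, since pshufb zeroes any destination byte whose control
# byte has its top bit set.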

.section        .rodata.cst16.mask2, "aM", @progbits, 16
.align 16
mask2:
.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF

.section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:
.octa 0x000102030405060708090A0B0C0D0E0F

.section        .rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
.align 32
pshufb_shf_table:
# use these values for shift constants for the pshufb instruction
# different alignments result in values as shown:
#       DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
#       DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
#       DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
#       DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
#       DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
#       DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
#       DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
#       DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
#       DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
#       DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
#       DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
#       DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
#       DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
#       DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
#       DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
.octa 0x8f8e8d8c8b8a89888786858483828100
.octa 0x000e0d0c0b0a09080706050403020100
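
# The shift control is loaded from pshufb_shf_table + 16 - r, where r is the
# number of remaining bytes (see _get_last_two_xmms and _zero_left). For
# example, r = 3 starts the load at offset 13 and picks up the "shl 3 / shr13"
# row above; the same 16 bytes, XORed with mask1, give the matching
# right-shift control.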