########################################################################
# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
#
# Copyright (c) 2013, Intel Corporation
#
# Authors:
#	Erdinc Ozturk <erdinc.ozturk@intel.com>
#	Vinodh Gopal <vinodh.gopal@intel.com>
#	James Guilford <james.guilford@intel.com>
#	Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Reference paper titled "Fast CRC Computation for Generic
# Polynomials Using PCLMULQDQ Instruction"
# URL: http://www.intel.com/content/dam/www/public/us/en/documents
# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
#

#include <linux/linkage.h>

.text

# Argument registers (SysV AMD64: arg0=%rdi, arg1=%rsi, arg2=%rdx).
#define init_crc	%edi
#define buf		%rsi
#define len		%rdx

# Scratch registers reserved across the whole function:
# FOLD_CONSTS holds the pair of folding multipliers currently in use
# (low qword and high qword are multiplied separately via pclmulqdq),
# and BSWAP_MASK holds the byte-reverse shuffle control for pshufb.
#define FOLD_CONSTS	%xmm10
#define BSWAP_MASK	%xmm11

# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
# reg1, reg2.
#
# Each 16-byte register is folded as:
#   (low 64 bits  * low qword of FOLD_CONSTS)   [pclmulqdq $0x00]
# ^ (high 64 bits * high qword of FOLD_CONSTS)  [pclmulqdq $0x11]
# ^ next 16 data bytes (byte-swapped to match bit/coefficient order)
# Clobbers %xmm8, %xmm9, %xmm12, %xmm13.
.macro fold_32_bytes offset, reg1, reg2
	movdqu	\offset(buf), %xmm9
	movdqu	\offset+16(buf), %xmm12
	pshufb	BSWAP_MASK, %xmm9
	pshufb	BSWAP_MASK, %xmm12
	movdqa	\reg1, %xmm8
	movdqa	\reg2, %xmm13
	pclmulqdq	$0x00, FOLD_CONSTS, \reg1
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm8
	pclmulqdq	$0x00, FOLD_CONSTS, \reg2
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm13
	pxor	%xmm9 , \reg1
	xorps	%xmm8 , \reg1
	pxor	%xmm12, \reg2
	xorps	%xmm13, \reg2
.endm

# Fold src_reg into dst_reg.
# Fold the 16 bytes in src_reg into the 16 bytes in dst_reg:
#   dst ^= (src.lo64 * FOLD_CONSTS.lo64) ^ (src.hi64 * FOLD_CONSTS.hi64)
# Clobbers %xmm8 and \src_reg.
.macro fold_16_bytes src_reg, dst_reg
	movdqa	\src_reg, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, \src_reg
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, \dst_reg
	xorps	\src_reg, \dst_reg
.endm

#
# u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
#
# Assumes len >= 16.
#
SYM_FUNC_START(crc_t10dif_pcl)

	movdqa	.Lbswap_mask(%rip), BSWAP_MASK

	# For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp	$256, len
	jl	.Lless_than_256_bytes

	# Load the first 128 data bytes.  Byte swapping is necessary to make
	# the bit order match the polynomial coefficient order.
	movdqu	16*0(buf), %xmm0
	movdqu	16*1(buf), %xmm1
	movdqu	16*2(buf), %xmm2
	movdqu	16*3(buf), %xmm3
	movdqu	16*4(buf), %xmm4
	movdqu	16*5(buf), %xmm5
	movdqu	16*6(buf), %xmm6
	movdqu	16*7(buf), %xmm7
	add	$128, buf
	pshufb	BSWAP_MASK, %xmm0
	pshufb	BSWAP_MASK, %xmm1
	pshufb	BSWAP_MASK, %xmm2
	pshufb	BSWAP_MASK, %xmm3
	pshufb	BSWAP_MASK, %xmm4
	pshufb	BSWAP_MASK, %xmm5
	pshufb	BSWAP_MASK, %xmm6
	pshufb	BSWAP_MASK, %xmm7

	# XOR the first 16 data *bits* with the initial CRC value.
	# pinsrw $7 places the 16-bit CRC in the highest word of xmm8, i.e.
	# aligned with the first 16 bits of the byte-swapped data in xmm0.
	pxor	%xmm8, %xmm8
	pinsrw	$7, init_crc, %xmm8
	pxor	%xmm8, %xmm0

	movdqa	.Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS

	# Subtract 128 for the 128 data bytes just consumed.  Subtract another
	# 128 to simplify the termination condition of the following loop.
	sub	$256, len

	# While >= 128 data bytes remain (not counting xmm0-7), fold the 128
	# bytes xmm0-7 into them, storing the result back into xmm0-7.
.Lfold_128_bytes_loop:
	fold_32_bytes	0, %xmm0, %xmm1
	fold_32_bytes	32, %xmm2, %xmm3
	fold_32_bytes	64, %xmm4, %xmm5
	fold_32_bytes	96, %xmm6, %xmm7
	add	$128, buf
	sub	$128, len
	jge	.Lfold_128_bytes_loop

	# Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.

	# Fold across 64 bytes.
	movdqa	.Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm0, %xmm4
	fold_16_bytes	%xmm1, %xmm5
	fold_16_bytes	%xmm2, %xmm6
	fold_16_bytes	%xmm3, %xmm7
	# Fold across 32 bytes.
	movdqa	.Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm4, %xmm6
	fold_16_bytes	%xmm5, %xmm7
	# Fold across 16 bytes.
	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm6, %xmm7

	# Add 128 to get the correct number of data bytes remaining in 0...127
	# (not counting xmm7), following the previous extra subtraction by 128.
	# Then subtract 16 to simplify the termination condition of the
	# following loop.
	add	$128-16, len

	# While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
	# xmm7 into them, storing the result back into xmm7.
	jl	.Lfold_16_bytes_loop_done
.Lfold_16_bytes_loop:
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, %xmm7
	movdqu	(buf), %xmm0
	pshufb	BSWAP_MASK, %xmm0
	pxor	%xmm0 , %xmm7
	add	$16, buf
	sub	$16, len
	jge	.Lfold_16_bytes_loop

.Lfold_16_bytes_loop_done:
	# Add 16 to get the correct number of data bytes remaining in 0...15
	# (not counting xmm7), following the previous extra subtraction by 16.
	add	$16, len
	je	.Lreduce_final_16_bytes

.Lhandle_partial_segment:
	# Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	# 16 bytes are in xmm7 and the rest are the remaining data in 'buf'.
	# To do this without needing a fold constant for each possible 'len',
	# redivide the bytes into a first chunk of 'len' bytes and a second
	# chunk of 16 bytes, then fold the first chunk into the second.

	movdqa	%xmm7, %xmm2

	# xmm1 = last 16 original data bytes
	movdqu	-16(buf, len), %xmm1
	pshufb	BSWAP_MASK, %xmm1

	# xmm2 = high order part of second chunk: xmm7 left-shifted by 'len'
	# bytes.  The shuffle control comes from .Lbyteshift_table, indexed so
	# that &table[16 - len] selects the left-shift-by-len pattern.
	lea	.Lbyteshift_table+16(%rip), %rax
	sub	len, %rax
	movdqu	(%rax), %xmm0
	pshufb	%xmm0, %xmm2

	# xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
	# XORing with .Lmask1 (all 0x80) turns the left-shift shuffle control
	# into the complementary right-shift control (0x80 zeroes a lane).
	pxor	.Lmask1(%rip), %xmm0
	pshufb	%xmm0, %xmm7

	# xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
	# then '16-len' bytes from xmm2 (high-order bytes).
	pblendvb	%xmm2, %xmm1	# xmm0 is the implicit blend mask

	# Fold the first chunk into the second chunk, storing the result in
	# xmm7.
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, %xmm7
	pxor	%xmm1, %xmm7

.Lreduce_final_16_bytes:
	# Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit
	# CRC.

	# Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	movdqa	.Lfinal_fold_consts(%rip), FOLD_CONSTS

	# Fold the high 64 bits into the low 64 bits, while also multiplying by
	# x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	# whose low 48 bits are 0.
	movdqa	%xmm7, %xmm0
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
	pslldq	$8, %xmm0
	pxor	%xmm0, %xmm7		  # + low bits * x^64

	# Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	# value congruent to x^64 * M(x) and whose low 48 bits are 0.
	movdqa	%xmm7, %xmm0
	pand	.Lmask2(%rip), %xmm0	  # zero high 32 bits
	psrldq	$12, %xmm7		  # extract high 32 bits
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
	pxor	%xmm0, %xmm7		  # + low bits

	# Load G(x) and floor(x^48 / G(x)).
	movdqa	.Lbarrett_reduction_consts(%rip), FOLD_CONSTS

	# Use Barrett reduction to compute the final CRC value.
	movdqa	%xmm7, %xmm0
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
	psrlq	$32, %xmm7		  # /= x^32
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # *= G(x)
	psrlq	$48, %xmm0
	pxor	%xmm7, %xmm0		  # + low 16 nonzero bits
	# Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.

	pextrw	$0, %xmm0, %eax
	RET

.align 16
.Lless_than_256_bytes:
	# Checksumming a buffer of length 16...255 bytes

	# Load the first 16 data bytes.
	movdqu	(buf), %xmm7
	pshufb	BSWAP_MASK, %xmm7
	add	$16, buf

	# XOR the first 16 data *bits* with the initial CRC value.
	pxor	%xmm0, %xmm0
	pinsrw	$7, init_crc, %xmm0
	pxor	%xmm0, %xmm7

	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
	cmp	$16, len
	je	.Lreduce_final_16_bytes	# len == 16
	sub	$32, len
	jge	.Lfold_16_bytes_loop	# 32 <= len <= 255
	add	$16, len
	jmp	.Lhandle_partial_segment # 17 <= len <= 31
SYM_FUNC_END(crc_t10dif_pcl)

.section	.rodata, "a", @progbits
.align	16

# Fold constants precomputed from the polynomial 0x18bb7
# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
# Each pair is {x^(d*8) mod G(x), x^(d*8+64) mod G(x)} for fold distance d
# bytes, consumed as the {low, high} qwords of FOLD_CONSTS.
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	# x^(8*128) mod G(x)
	.quad		0x0000000000002295	# x^(8*128+64) mod G(x)
.Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	# x^(4*128) mod G(x)
	.quad		0x000000000000dd31	# x^(4*128+64) mod G(x)
.Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	# x^(2*128) mod G(x)
	.quad		0x0000000000007acc	# x^(2*128+64) mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	# x^(1*128) mod G(x)
	.quad		0x0000000000001faa	# x^(1*128+64) mod G(x)
.Lfinal_fold_consts:
	.quad		0x1368000000000000	# x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	# x^48 * (x^80 mod G(x))
.Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	# G(x)
	.quad		0x00000001f65a57f8	# floor(x^48 / G(x))

# pshufb control of all 0x80: XORed into a byteshift_table entry it flips a
# left-shift shuffle into the complementary right-shift (0x80 zeroes a lane).
.section	.rodata.cst16.mask1, "aM", @progbits, 16
.align		16
.Lmask1:
	.octa	0x80808080808080808080808080808080

# Mask that keeps the low 96 bits and zeroes the high 32 bits of an xmm reg.
.section	.rodata.cst16.mask2, "aM", @progbits, 16
.align		16
.Lmask2:
	.octa	0x00000000FFFFFFFFFFFFFFFFFFFFFFFF

# pshufb control that reverses the byte order of a 16-byte vector.
.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align		16
.Lbswap_mask:
	.octa	0x000102030405060708090A0B0C0D0E0F

.section	.rodata.cst32.byteshift_table, "aM", @progbits, 32
.align		16
# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
# len] is the index vector to shift left by 'len' bytes, and is also {0x80,
# ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0