168411521SHerbert Xu########################################################################
268411521SHerbert Xu# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
368411521SHerbert Xu#
468411521SHerbert Xu# Copyright (c) 2013, Intel Corporation
568411521SHerbert Xu#
668411521SHerbert Xu# Authors:
768411521SHerbert Xu#     Erdinc Ozturk <erdinc.ozturk@intel.com>
868411521SHerbert Xu#     Vinodh Gopal <vinodh.gopal@intel.com>
968411521SHerbert Xu#     James Guilford <james.guilford@intel.com>
1068411521SHerbert Xu#     Tim Chen <tim.c.chen@linux.intel.com>
1168411521SHerbert Xu#
1268411521SHerbert Xu# This software is available to you under a choice of one of two
1368411521SHerbert Xu# licenses.  You may choose to be licensed under the terms of the GNU
1468411521SHerbert Xu# General Public License (GPL) Version 2, available from the file
1568411521SHerbert Xu# COPYING in the main directory of this source tree, or the
1668411521SHerbert Xu# OpenIB.org BSD license below:
1768411521SHerbert Xu#
1868411521SHerbert Xu# Redistribution and use in source and binary forms, with or without
1968411521SHerbert Xu# modification, are permitted provided that the following conditions are
2068411521SHerbert Xu# met:
2168411521SHerbert Xu#
2268411521SHerbert Xu# * Redistributions of source code must retain the above copyright
2368411521SHerbert Xu#   notice, this list of conditions and the following disclaimer.
2468411521SHerbert Xu#
2568411521SHerbert Xu# * Redistributions in binary form must reproduce the above copyright
2668411521SHerbert Xu#   notice, this list of conditions and the following disclaimer in the
2768411521SHerbert Xu#   documentation and/or other materials provided with the
2868411521SHerbert Xu#   distribution.
2968411521SHerbert Xu#
3068411521SHerbert Xu# * Neither the name of the Intel Corporation nor the names of its
3168411521SHerbert Xu#   contributors may be used to endorse or promote products derived from
3268411521SHerbert Xu#   this software without specific prior written permission.
3368411521SHerbert Xu#
3468411521SHerbert Xu#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
3668411521SHerbert Xu# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
3768411521SHerbert Xu# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
3868411521SHerbert Xu# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
3968411521SHerbert Xu# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
4068411521SHerbert Xu# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
4168411521SHerbert Xu# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
4268411521SHerbert Xu# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
4368411521SHerbert Xu# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
4468411521SHerbert Xu# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
4568411521SHerbert Xu# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4668411521SHerbert Xu#
4768411521SHerbert Xu#       Reference paper titled "Fast CRC Computation for Generic
4868411521SHerbert Xu#	Polynomials Using PCLMULQDQ Instruction"
4968411521SHerbert Xu#       URL: http://www.intel.com/content/dam/www/public/us/en/documents
5068411521SHerbert Xu#  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
5168411521SHerbert Xu#
5268411521SHerbert Xu
5368411521SHerbert Xu#include <linux/linkage.h>
5468411521SHerbert Xu
5568411521SHerbert Xu.text
5668411521SHerbert Xu
570974037fSEric Biggers#define		init_crc	%edi
580974037fSEric Biggers#define		buf		%rsi
590974037fSEric Biggers#define		len		%rdx
6068411521SHerbert Xu
610974037fSEric Biggers#define		FOLD_CONSTS	%xmm10
620974037fSEric Biggers#define		BSWAP_MASK	%xmm11
6368411521SHerbert Xu
640974037fSEric Biggers# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
650974037fSEric Biggers# reg1, reg2.
660974037fSEric Biggers.macro	fold_32_bytes	offset, reg1, reg2
670974037fSEric Biggers	movdqu	\offset(buf), %xmm9
680974037fSEric Biggers	movdqu	\offset+16(buf), %xmm12
690974037fSEric Biggers	pshufb	BSWAP_MASK, %xmm9
700974037fSEric Biggers	pshufb	BSWAP_MASK, %xmm12
710974037fSEric Biggers	movdqa	\reg1, %xmm8
720974037fSEric Biggers	movdqa	\reg2, %xmm13
730974037fSEric Biggers	pclmulqdq	$0x00, FOLD_CONSTS, \reg1
740974037fSEric Biggers	pclmulqdq	$0x11, FOLD_CONSTS, %xmm8
750974037fSEric Biggers	pclmulqdq	$0x00, FOLD_CONSTS, \reg2
760974037fSEric Biggers	pclmulqdq	$0x11, FOLD_CONSTS, %xmm13
770974037fSEric Biggers	pxor	%xmm9 , \reg1
780974037fSEric Biggers	xorps	%xmm8 , \reg1
790974037fSEric Biggers	pxor	%xmm12, \reg2
800974037fSEric Biggers	xorps	%xmm13, \reg2
810974037fSEric Biggers.endm
820974037fSEric Biggers
830974037fSEric Biggers# Fold src_reg into dst_reg.
840974037fSEric Biggers.macro	fold_16_bytes	src_reg, dst_reg
850974037fSEric Biggers	movdqa	\src_reg, %xmm8
860974037fSEric Biggers	pclmulqdq	$0x11, FOLD_CONSTS, \src_reg
870974037fSEric Biggers	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
880974037fSEric Biggers	pxor	%xmm8, \dst_reg
890974037fSEric Biggers	xorps	\src_reg, \dst_reg
900974037fSEric Biggers.endm
910974037fSEric Biggers
920974037fSEric Biggers#
930974037fSEric Biggers# u16 crc_t10dif_pcl(u16 init_crc, const *u8 buf, size_t len);
940974037fSEric Biggers#
950974037fSEric Biggers# Assumes len >= 16.
960974037fSEric Biggers#
976dcc5627SJiri SlabySYM_FUNC_START(crc_t10dif_pcl)
9868411521SHerbert Xu
990974037fSEric Biggers	movdqa	.Lbswap_mask(%rip), BSWAP_MASK
10068411521SHerbert Xu
1010974037fSEric Biggers	# For sizes less than 256 bytes, we can't fold 128 bytes at a time.
1020974037fSEric Biggers	cmp	$256, len
1030974037fSEric Biggers	jl	.Lless_than_256_bytes
10468411521SHerbert Xu
1050974037fSEric Biggers	# Load the first 128 data bytes.  Byte swapping is necessary to make the
1060974037fSEric Biggers	# bit order match the polynomial coefficient order.
1070974037fSEric Biggers	movdqu	16*0(buf), %xmm0
1080974037fSEric Biggers	movdqu	16*1(buf), %xmm1
1090974037fSEric Biggers	movdqu	16*2(buf), %xmm2
1100974037fSEric Biggers	movdqu	16*3(buf), %xmm3
1110974037fSEric Biggers	movdqu	16*4(buf), %xmm4
1120974037fSEric Biggers	movdqu	16*5(buf), %xmm5
1130974037fSEric Biggers	movdqu	16*6(buf), %xmm6
1140974037fSEric Biggers	movdqu	16*7(buf), %xmm7
1150974037fSEric Biggers	add	$128, buf
1160974037fSEric Biggers	pshufb	BSWAP_MASK, %xmm0
1170974037fSEric Biggers	pshufb	BSWAP_MASK, %xmm1
1180974037fSEric Biggers	pshufb	BSWAP_MASK, %xmm2
1190974037fSEric Biggers	pshufb	BSWAP_MASK, %xmm3
1200974037fSEric Biggers	pshufb	BSWAP_MASK, %xmm4
1210974037fSEric Biggers	pshufb	BSWAP_MASK, %xmm5
1220974037fSEric Biggers	pshufb	BSWAP_MASK, %xmm6
1230974037fSEric Biggers	pshufb	BSWAP_MASK, %xmm7
12468411521SHerbert Xu
1250974037fSEric Biggers	# XOR the first 16 data *bits* with the initial CRC value.
1260974037fSEric Biggers	pxor	%xmm8, %xmm8
1270974037fSEric Biggers	pinsrw	$7, init_crc, %xmm8
1280974037fSEric Biggers	pxor	%xmm8, %xmm0
12968411521SHerbert Xu
1300974037fSEric Biggers	movdqa	.Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS
13168411521SHerbert Xu
1320974037fSEric Biggers	# Subtract 128 for the 128 data bytes just consumed.  Subtract another
1330974037fSEric Biggers	# 128 to simplify the termination condition of the following loop.
1340974037fSEric Biggers	sub	$256, len
13568411521SHerbert Xu
1360974037fSEric Biggers	# While >= 128 data bytes remain (not counting xmm0-7), fold the 128
1370974037fSEric Biggers	# bytes xmm0-7 into them, storing the result back into xmm0-7.
1380974037fSEric Biggers.Lfold_128_bytes_loop:
1390974037fSEric Biggers	fold_32_bytes	0, %xmm0, %xmm1
1400974037fSEric Biggers	fold_32_bytes	32, %xmm2, %xmm3
1410974037fSEric Biggers	fold_32_bytes	64, %xmm4, %xmm5
1420974037fSEric Biggers	fold_32_bytes	96, %xmm6, %xmm7
1430974037fSEric Biggers	add	$128, buf
1440974037fSEric Biggers	sub	$128, len
1450974037fSEric Biggers	jge	.Lfold_128_bytes_loop
14668411521SHerbert Xu
1470974037fSEric Biggers	# Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
14868411521SHerbert Xu
1490974037fSEric Biggers	# Fold across 64 bytes.
1500974037fSEric Biggers	movdqa	.Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
1510974037fSEric Biggers	fold_16_bytes	%xmm0, %xmm4
1520974037fSEric Biggers	fold_16_bytes	%xmm1, %xmm5
1530974037fSEric Biggers	fold_16_bytes	%xmm2, %xmm6
1540974037fSEric Biggers	fold_16_bytes	%xmm3, %xmm7
1550974037fSEric Biggers	# Fold across 32 bytes.
1560974037fSEric Biggers	movdqa	.Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
1570974037fSEric Biggers	fold_16_bytes	%xmm4, %xmm6
1580974037fSEric Biggers	fold_16_bytes	%xmm5, %xmm7
1590974037fSEric Biggers	# Fold across 16 bytes.
1600974037fSEric Biggers	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
1610974037fSEric Biggers	fold_16_bytes	%xmm6, %xmm7
16268411521SHerbert Xu
1630974037fSEric Biggers	# Add 128 to get the correct number of data bytes remaining in 0...127
1640974037fSEric Biggers	# (not counting xmm7), following the previous extra subtraction by 128.
1650974037fSEric Biggers	# Then subtract 16 to simplify the termination condition of the
1660974037fSEric Biggers	# following loop.
1670974037fSEric Biggers	add	$128-16, len
16868411521SHerbert Xu
1690974037fSEric Biggers	# While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
1700974037fSEric Biggers	# xmm7 into them, storing the result back into xmm7.
1710974037fSEric Biggers	jl	.Lfold_16_bytes_loop_done
1720974037fSEric Biggers.Lfold_16_bytes_loop:
17368411521SHerbert Xu	movdqa	%xmm7, %xmm8
1740974037fSEric Biggers	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
1750974037fSEric Biggers	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
17668411521SHerbert Xu	pxor	%xmm8, %xmm7
1770974037fSEric Biggers	movdqu	(buf), %xmm0
1780974037fSEric Biggers	pshufb	BSWAP_MASK, %xmm0
17968411521SHerbert Xu	pxor	%xmm0 , %xmm7
1800974037fSEric Biggers	add	$16, buf
1810974037fSEric Biggers	sub	$16, len
1820974037fSEric Biggers	jge	.Lfold_16_bytes_loop
18368411521SHerbert Xu
1840974037fSEric Biggers.Lfold_16_bytes_loop_done:
1850974037fSEric Biggers	# Add 16 to get the correct number of data bytes remaining in 0...15
1860974037fSEric Biggers	# (not counting xmm7), following the previous extra subtraction by 16.
1870974037fSEric Biggers	add	$16, len
1880974037fSEric Biggers	je	.Lreduce_final_16_bytes
18968411521SHerbert Xu
1900974037fSEric Biggers.Lhandle_partial_segment:
1910974037fSEric Biggers	# Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
1920974037fSEric Biggers	# bytes are in xmm7 and the rest are the remaining data in 'buf'.  To do
1930974037fSEric Biggers	# this without needing a fold constant for each possible 'len', redivide
1940974037fSEric Biggers	# the bytes into a first chunk of 'len' bytes and a second chunk of 16
1950974037fSEric Biggers	# bytes, then fold the first chunk into the second.
19668411521SHerbert Xu
19768411521SHerbert Xu	movdqa	%xmm7, %xmm2
19868411521SHerbert Xu
1990974037fSEric Biggers	# xmm1 = last 16 original data bytes
2000974037fSEric Biggers	movdqu	-16(buf, len), %xmm1
2010974037fSEric Biggers	pshufb	BSWAP_MASK, %xmm1
20268411521SHerbert Xu
2030974037fSEric Biggers	# xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
2040974037fSEric Biggers	lea	.Lbyteshift_table+16(%rip), %rax
2050974037fSEric Biggers	sub	len, %rax
20668411521SHerbert Xu	movdqu	(%rax), %xmm0
20768411521SHerbert Xu	pshufb	%xmm0, %xmm2
20868411521SHerbert Xu
2090974037fSEric Biggers	# xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
2100974037fSEric Biggers	pxor	.Lmask1(%rip), %xmm0
21168411521SHerbert Xu	pshufb	%xmm0, %xmm7
2120974037fSEric Biggers
2130974037fSEric Biggers	# xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
2140974037fSEric Biggers	# then '16-len' bytes from xmm2 (high-order bytes).
21568411521SHerbert Xu	pblendvb	%xmm2, %xmm1	#xmm0 is implicit
21668411521SHerbert Xu
2170974037fSEric Biggers	# Fold the first chunk into the second chunk, storing the result in xmm7.
21868411521SHerbert Xu	movdqa	%xmm7, %xmm8
2190974037fSEric Biggers	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
2200974037fSEric Biggers	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
22168411521SHerbert Xu	pxor	%xmm8, %xmm7
2220974037fSEric Biggers	pxor	%xmm1, %xmm7
22368411521SHerbert Xu
2240974037fSEric Biggers.Lreduce_final_16_bytes:
2250974037fSEric Biggers	# Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
2260974037fSEric Biggers
2270974037fSEric Biggers	# Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
2280974037fSEric Biggers	movdqa	.Lfinal_fold_consts(%rip), FOLD_CONSTS
2290974037fSEric Biggers
2300974037fSEric Biggers	# Fold the high 64 bits into the low 64 bits, while also multiplying by
2310974037fSEric Biggers	# x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
2320974037fSEric Biggers	# whose low 48 bits are 0.
23368411521SHerbert Xu	movdqa	%xmm7, %xmm0
2340974037fSEric Biggers	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
23568411521SHerbert Xu	pslldq	$8, %xmm0
2360974037fSEric Biggers	pxor	%xmm0, %xmm7			  # + low bits * x^64
23768411521SHerbert Xu
2380974037fSEric Biggers	# Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
2390974037fSEric Biggers	# value congruent to x^64 * M(x) and whose low 48 bits are 0.
24068411521SHerbert Xu	movdqa	%xmm7, %xmm0
2410974037fSEric Biggers	pand	.Lmask2(%rip), %xmm0		  # zero high 32 bits
2420974037fSEric Biggers	psrldq	$12, %xmm7			  # extract high 32 bits
2430974037fSEric Biggers	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
2440974037fSEric Biggers	pxor	%xmm0, %xmm7			  # + low bits
24568411521SHerbert Xu
2460974037fSEric Biggers	# Load G(x) and floor(x^48 / G(x)).
2470974037fSEric Biggers	movdqa	.Lbarrett_reduction_consts(%rip), FOLD_CONSTS
24868411521SHerbert Xu
2490974037fSEric Biggers	# Use Barrett reduction to compute the final CRC value.
25068411521SHerbert Xu	movdqa	%xmm7, %xmm0
2510974037fSEric Biggers	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
2520974037fSEric Biggers	psrlq	$32, %xmm7			  # /= x^32
2530974037fSEric Biggers	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # *= G(x)
2540974037fSEric Biggers	psrlq	$48, %xmm0
2550974037fSEric Biggers	pxor	%xmm7, %xmm0		     # + low 16 nonzero bits
2560974037fSEric Biggers	# Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
25768411521SHerbert Xu
2580974037fSEric Biggers	pextrw	$0, %xmm0, %eax
259*f94909ceSPeter Zijlstra	RET
26068411521SHerbert Xu
26168411521SHerbert Xu.align 16
2620974037fSEric Biggers.Lless_than_256_bytes:
2630974037fSEric Biggers	# Checksumming a buffer of length 16...255 bytes
26468411521SHerbert Xu
2650974037fSEric Biggers	# Load the first 16 data bytes.
2660974037fSEric Biggers	movdqu	(buf), %xmm7
2670974037fSEric Biggers	pshufb	BSWAP_MASK, %xmm7
2680974037fSEric Biggers	add	$16, buf
26968411521SHerbert Xu
2700974037fSEric Biggers	# XOR the first 16 data *bits* with the initial CRC value.
2710974037fSEric Biggers	pxor	%xmm0, %xmm0
2720974037fSEric Biggers	pinsrw	$7, init_crc, %xmm0
27368411521SHerbert Xu	pxor	%xmm0, %xmm7
27468411521SHerbert Xu
2750974037fSEric Biggers	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
2760974037fSEric Biggers	cmp	$16, len
2770974037fSEric Biggers	je	.Lreduce_final_16_bytes		# len == 16
2780974037fSEric Biggers	sub	$32, len
2790974037fSEric Biggers	jge	.Lfold_16_bytes_loop		# 32 <= len <= 255
2800974037fSEric Biggers	add	$16, len
2810974037fSEric Biggers	jmp	.Lhandle_partial_segment	# 17 <= len <= 31
2826dcc5627SJiri SlabySYM_FUNC_END(crc_t10dif_pcl)
28368411521SHerbert Xu
284e183914aSDenys Vlasenko.section	.rodata, "a", @progbits
285e183914aSDenys Vlasenko.align 16
28668411521SHerbert Xu
2870974037fSEric Biggers# Fold constants precomputed from the polynomial 0x18bb7
2880974037fSEric Biggers# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
2890974037fSEric Biggers.Lfold_across_128_bytes_consts:
2900974037fSEric Biggers	.quad		0x0000000000006123	# x^(8*128)	mod G(x)
2910974037fSEric Biggers	.quad		0x0000000000002295	# x^(8*128+64)	mod G(x)
2920974037fSEric Biggers.Lfold_across_64_bytes_consts:
2930974037fSEric Biggers	.quad		0x0000000000001069	# x^(4*128)	mod G(x)
2940974037fSEric Biggers	.quad		0x000000000000dd31	# x^(4*128+64)	mod G(x)
2950974037fSEric Biggers.Lfold_across_32_bytes_consts:
2960974037fSEric Biggers	.quad		0x000000000000857d	# x^(2*128)	mod G(x)
2970974037fSEric Biggers	.quad		0x0000000000007acc	# x^(2*128+64)	mod G(x)
2980974037fSEric Biggers.Lfold_across_16_bytes_consts:
2990974037fSEric Biggers	.quad		0x000000000000a010	# x^(1*128)	mod G(x)
3000974037fSEric Biggers	.quad		0x0000000000001faa	# x^(1*128+64)	mod G(x)
3010974037fSEric Biggers.Lfinal_fold_consts:
3020974037fSEric Biggers	.quad		0x1368000000000000	# x^48 * (x^48 mod G(x))
3030974037fSEric Biggers	.quad		0x2d56000000000000	# x^48 * (x^80 mod G(x))
3040974037fSEric Biggers.Lbarrett_reduction_consts:
3050974037fSEric Biggers	.quad		0x0000000000018bb7	# G(x)
3060974037fSEric Biggers	.quad		0x00000001f65a57f8	# floor(x^48 / G(x))
30768411521SHerbert Xu
308e183914aSDenys Vlasenko.section	.rodata.cst16.mask1, "aM", @progbits, 16
309e183914aSDenys Vlasenko.align 16
3100974037fSEric Biggers.Lmask1:
31168411521SHerbert Xu	.octa	0x80808080808080808080808080808080
312e183914aSDenys Vlasenko
313e183914aSDenys Vlasenko.section	.rodata.cst16.mask2, "aM", @progbits, 16
314e183914aSDenys Vlasenko.align 16
3150974037fSEric Biggers.Lmask2:
31668411521SHerbert Xu	.octa	0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
31768411521SHerbert Xu
3180974037fSEric Biggers.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
319e183914aSDenys Vlasenko.align 16
3200974037fSEric Biggers.Lbswap_mask:
32168411521SHerbert Xu	.octa	0x000102030405060708090A0B0C0D0E0F
32268411521SHerbert Xu
3230974037fSEric Biggers.section	.rodata.cst32.byteshift_table, "aM", @progbits, 32
3240974037fSEric Biggers.align 16
3250974037fSEric Biggers# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
3260974037fSEric Biggers# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
3270974037fSEric Biggers# 0x80} XOR the index vector to shift right by '16 - len' bytes.
3280974037fSEric Biggers.Lbyteshift_table:
3290974037fSEric Biggers	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
3300974037fSEric Biggers	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
3310974037fSEric Biggers	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
3320974037fSEric Biggers	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0
333