arm64/crypto/crct10dif-ce-core.S

6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
6227cd12SEric Biggers// Copyright (C) 2019 Google LLC <ebiggers@google.com>
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel// This program is free software; you can redistribute it and/or modify
6ef5737fSArd Biesheuvel// it under the terms of the GNU General Public License version 2 as
6ef5737fSArd Biesheuvel// published by the Free Software Foundation.
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers// Derived from the x86 version:
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel// Copyright (c) 2013, Intel Corporation
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel// Authors:
6ef5737fSArd Biesheuvel//     Erdinc Ozturk <erdinc.ozturk@intel.com>
6ef5737fSArd Biesheuvel//     Vinodh Gopal <vinodh.gopal@intel.com>
6ef5737fSArd Biesheuvel//     James Guilford <james.guilford@intel.com>
6ef5737fSArd Biesheuvel//     Tim Chen <tim.c.chen@linux.intel.com>
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel// This software is available to you under a choice of one of two
6ef5737fSArd Biesheuvel// licenses.  You may choose to be licensed under the terms of the GNU
6ef5737fSArd Biesheuvel// General Public License (GPL) Version 2, available from the file
6ef5737fSArd Biesheuvel// COPYING in the main directory of this source tree, or the
6ef5737fSArd Biesheuvel// OpenIB.org BSD license below:
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel// Redistribution and use in source and binary forms, with or without
6ef5737fSArd Biesheuvel// modification, are permitted provided that the following conditions are
6ef5737fSArd Biesheuvel// met:
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel// * Redistributions of source code must retain the above copyright
6ef5737fSArd Biesheuvel//   notice, this list of conditions and the following disclaimer.
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel// * Redistributions in binary form must reproduce the above copyright
6ef5737fSArd Biesheuvel//   notice, this list of conditions and the following disclaimer in the
6ef5737fSArd Biesheuvel//   documentation and/or other materials provided with the
6ef5737fSArd Biesheuvel//   distribution.
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel// * Neither the name of the Intel Corporation nor the names of its
6ef5737fSArd Biesheuvel//   contributors may be used to endorse or promote products derived from
6ef5737fSArd Biesheuvel//   this software without specific prior written permission.
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
6ef5737fSArd Biesheuvel// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
6ef5737fSArd Biesheuvel// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
6ef5737fSArd Biesheuvel// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
6ef5737fSArd Biesheuvel// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
6ef5737fSArd Biesheuvel// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
6ef5737fSArd Biesheuvel// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
6ef5737fSArd Biesheuvel// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
6ef5737fSArd Biesheuvel// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
6ef5737fSArd Biesheuvel// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
6ef5737fSArd Biesheuvel// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel//       Reference paper titled "Fast CRC Computation for Generic
6ef5737fSArd Biesheuvel//	Polynomials Using PCLMULQDQ Instruction"
6ef5737fSArd Biesheuvel//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
6ef5737fSArd Biesheuvel//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
6ef5737fSArd Biesheuvel//
6ef5737fSArd Biesheuvel
6ef5737fSArd Biesheuvel#include <linux/linkage.h>
6ef5737fSArd Biesheuvel#include <asm/assembler.h>
6ef5737fSArd Biesheuvel
6ef5737fSArd Biesheuvel	.text
3ca73b70SMark Brown	.arch		armv8-a+crypto
6ef5737fSArd Biesheuvel
	// Integer registers.  init_crc, buf and len are the three arguments of
	// crc_t10dif_pmull_p8() / crc_t10dif_pmull_p64(), in prototype order.
fc754c02SArd Biesheuvel	init_crc	.req	w0
fc754c02SArd Biesheuvel	buf		.req	x1
fc754c02SArd Biesheuvel	len		.req	x2
	// Cursor into the table of fold constants in .rodata.
fc754c02SArd Biesheuvel	fold_consts_ptr	.req	x3
6ef5737fSArd Biesheuvel
	// The pair of 64-bit fold constants currently in use.
6227cd12SEric Biggers	fold_consts	.req	v10
6ef5737fSArd Biesheuvel
	// Everything below is referenced only by the p8 (8-bit polynomial
	// multiply) macros and by __pmull_p8_core.

	// Copy of the A operand handed to __pmull_p8_core by __pmull_p8.
2fffee53SArd Biesheuvel	ad		.req	v14
2fffee53SArd Biesheuvel
	// Masks built once by __pmull_init_p8.
2fffee53SArd Biesheuvel	k00_16		.req	v15
2fffee53SArd Biesheuvel	k32_48		.req	v16
2fffee53SArd Biesheuvel
	// Temporaries of __pmull_p8_core; its results are left in t4 and t6.
2fffee53SArd Biesheuvel	t3		.req	v17
2fffee53SArd Biesheuvel	t4		.req	v18
2fffee53SArd Biesheuvel	t5		.req	v19
2fffee53SArd Biesheuvel	t6		.req	v20
2fffee53SArd Biesheuvel	t7		.req	v21
2fffee53SArd Biesheuvel	t8		.req	v22
2fffee53SArd Biesheuvel	t9		.req	v23
2fffee53SArd Biesheuvel
	// tbl index vectors built once by __pmull_init_p8.
2fffee53SArd Biesheuvel	perm1		.req	v24
2fffee53SArd Biesheuvel	perm2		.req	v25
2fffee53SArd Biesheuvel	perm3		.req	v26
2fffee53SArd Biesheuvel	perm4		.req	v27
2fffee53SArd Biesheuvel
	// Byte-rotated copies of the constant operand, refreshed by
	// __pmull_pre_p8 each time fold_consts is reloaded.
2fffee53SArd Biesheuvel	bd1		.req	v28
2fffee53SArd Biesheuvel	bd2		.req	v29
2fffee53SArd Biesheuvel	bd3		.req	v30
2fffee53SArd Biesheuvel	bd4		.req	v31
2fffee53SArd Biesheuvel
// __pmull_init_p64: one-time setup hook for the p64 flavour.  Intentionally
// empty; it exists so that crc_t10dif_pmull can invoke __pmull_init_<p> for
// either flavour.
2fffee53SArd Biesheuvel	.macro		__pmull_init_p64
2fffee53SArd Biesheuvel	.endm
2fffee53SArd Biesheuvel
// __pmull_pre_p64: hook run after each reload of the constant operand.
// Intentionally empty for the p64 flavour; the bd argument is accepted only
// so that the call sites match __pmull_pre_p8.
2fffee53SArd Biesheuvel	.macro		__pmull_pre_p64, bd
2fffee53SArd Biesheuvel	.endm
2fffee53SArd Biesheuvel
// __pmull_init_p8: one-time setup for the p8 flavour.  Builds the masks
// k00_16 / k32_48 and the tbl index vectors perm1..perm4 that
// __pmull_pre_p8 and __pmull_p8_core read.  Clobbers x5.
2fffee53SArd Biesheuvel	.macro		__pmull_init_p8
2fffee53SArd Biesheuvel	// k00_16 := 0x0000000000000000_000000000000ffff
2fffee53SArd Biesheuvel	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
2fffee53SArd Biesheuvel	movi		k32_48.2d, #0xffffffff
2fffee53SArd Biesheuvel	mov		k32_48.h[2], k32_48.h[0]
2fffee53SArd Biesheuvel	ushr		k00_16.2d, k32_48.2d, #32
2fffee53SArd Biesheuvel
2fffee53SArd Biesheuvel	// prepare the permutation vectors
	// perm1 ends up holding, in each 64-bit half, the byte indices of
	// that half rotated by one position: {1,2,...,7,0} in the low half
	// and {9,10,...,15,8} in the high half.  The constant in x5 is the
	// high-half pattern; XORing the low eight bytes with 8 (the 64-bit
	// movi leaves the upper half of perm4 zero) yields the low-half one.
	// perm4 is only scratch here and is overwritten further down.
2fffee53SArd Biesheuvel	mov_q		x5, 0x080f0e0d0c0b0a09
2fffee53SArd Biesheuvel	movi		perm4.8b, #8
2fffee53SArd Biesheuvel	dup		perm1.2d, x5
2fffee53SArd Biesheuvel	eor		perm1.16b, perm1.16b, perm4.16b
	// perm2..perm4 are perm1 with each 64-bit lane rotated by a further
	// 1, 2 and 3 bytes (the ushr + sli pairs form the rotate), giving
	// index vectors that rotate each half by 2, 3 and 4 bytes.
2fffee53SArd Biesheuvel	ushr		perm2.2d, perm1.2d, #8
2fffee53SArd Biesheuvel	ushr		perm3.2d, perm1.2d, #16
2fffee53SArd Biesheuvel	ushr		perm4.2d, perm1.2d, #24
2fffee53SArd Biesheuvel	sli		perm2.2d, perm1.2d, #56
2fffee53SArd Biesheuvel	sli		perm3.2d, perm1.2d, #48
2fffee53SArd Biesheuvel	sli		perm4.2d, perm1.2d, #40
2fffee53SArd Biesheuvel	.endm
2fffee53SArd Biesheuvel
// __pmull_pre_p8: derive bd1..bd4 from the constant operand bd for use by
// __pmull_p8_core.  bdN is bd with the bytes of each 64-bit half rotated by
// N positions (byte i of a half is taken from byte (i + N) mod 8 of that
// half).  Every reload of fold_consts in this file is followed by this
// macro.  Requires perm1..perm4 from __pmull_init_p8.
2fffee53SArd Biesheuvel	.macro		__pmull_pre_p8, bd
2fffee53SArd Biesheuvel	tbl		bd1.16b, {\bd\().16b}, perm1.16b
2fffee53SArd Biesheuvel	tbl		bd2.16b, {\bd\().16b}, perm2.16b
2fffee53SArd Biesheuvel	tbl		bd3.16b, {\bd\().16b}, perm3.16b
2fffee53SArd Biesheuvel	tbl		bd4.16b, {\bd\().16b}, perm4.16b
2fffee53SArd Biesheuvel	.endm
2fffee53SArd Biesheuvel
// __pmull_p8_core: helper for __pmull_p8.  Computes, using only 8-bit
// polynomial multiplies, the cross terms that __pmull_p8 XORs into its own
// D = A*B product so that the p8 flavour can stand in for the 64-bit pmull
// used by __pmull_p64.
//
// Entry points (reached with bl from __pmull_p8):
//   .L__pmull_p8_core   operates on the low 64-bit halves  (pmull,  ext)
//   .L__pmull_p8_core2  operates on the high 64-bit halves (pmull2, tbl)
// Both join at local label 0.
//
// In:       ad = A, fold_consts = B, bd1..bd4 = rotated copies of B,
//           k00_16 / k32_48 masks; the core2 entry also reads perm1..perm3.
// Out:      t4 and t6, both to be XORed into the result by the caller.
// Clobbers: t3..t9.
0e89640bSMark BrownSYM_FUNC_START_LOCAL(__pmull_p8_core)
2fffee53SArd Biesheuvel.L__pmull_p8_core:
2fffee53SArd Biesheuvel	ext		t4.8b, ad.8b, ad.8b, #1			// A1
2fffee53SArd Biesheuvel	ext		t5.8b, ad.8b, ad.8b, #2			// A2
2fffee53SArd Biesheuvel	ext		t6.8b, ad.8b, ad.8b, #3			// A3
2fffee53SArd Biesheuvel
6227cd12SEric Biggers	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
2fffee53SArd Biesheuvel	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
6227cd12SEric Biggers	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
2fffee53SArd Biesheuvel	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
6227cd12SEric Biggers	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
2fffee53SArd Biesheuvel	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
2fffee53SArd Biesheuvel	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
2fffee53SArd Biesheuvel	b		0f
2fffee53SArd Biesheuvel
	// High-half variant: same products, formed from the upper 64 bits of
	// each operand.  The byte rotations of A come from tbl with perm1..3.
2fffee53SArd Biesheuvel.L__pmull_p8_core2:
2fffee53SArd Biesheuvel	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
2fffee53SArd Biesheuvel	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
2fffee53SArd Biesheuvel	tbl		t6.16b, {ad.16b}, perm3.16b		// A3
2fffee53SArd Biesheuvel
6227cd12SEric Biggers	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
2fffee53SArd Biesheuvel	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
6227cd12SEric Biggers	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
2fffee53SArd Biesheuvel	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
6227cd12SEric Biggers	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
2fffee53SArd Biesheuvel	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
2fffee53SArd Biesheuvel	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4
2fffee53SArd Biesheuvel
	// Common tail: combine the partial products pairwise.
2fffee53SArd Biesheuvel0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
2fffee53SArd Biesheuvel	eor		t5.16b, t5.16b, t7.16b			// M = G + H
2fffee53SArd Biesheuvel	eor		t6.16b, t6.16b, t9.16b			// N = I + J
2fffee53SArd Biesheuvel
	// Regroup so that the low 64-bit halves of two terms share one
	// register and their high halves share another.
2fffee53SArd Biesheuvel	uzp1		t8.2d, t4.2d, t5.2d
2fffee53SArd Biesheuvel	uzp2		t4.2d, t4.2d, t5.2d
2fffee53SArd Biesheuvel	uzp1		t7.2d, t6.2d, t3.2d
2fffee53SArd Biesheuvel	uzp2		t6.2d, t6.2d, t3.2d
2fffee53SArd Biesheuvel
2fffee53SArd Biesheuvel	// t4 = (L) (P0 + P1) << 8
2fffee53SArd Biesheuvel	// t5 = (M) (P2 + P3) << 16
2fffee53SArd Biesheuvel	eor		t8.16b, t8.16b, t4.16b
2fffee53SArd Biesheuvel	and		t4.16b, t4.16b, k32_48.16b
2fffee53SArd Biesheuvel
2fffee53SArd Biesheuvel	// t6 = (N) (P4 + P5) << 24
2fffee53SArd Biesheuvel	// t7 = (K) (P6 + P7) << 32
2fffee53SArd Biesheuvel	eor		t7.16b, t7.16b, t6.16b
2fffee53SArd Biesheuvel	and		t6.16b, t6.16b, k00_16.16b
2fffee53SArd Biesheuvel
2fffee53SArd Biesheuvel	eor		t8.16b, t8.16b, t4.16b
2fffee53SArd Biesheuvel	eor		t7.16b, t7.16b, t6.16b
2fffee53SArd Biesheuvel
	// Undo the regrouping: back to one term per register.
2fffee53SArd Biesheuvel	zip2		t5.2d, t8.2d, t4.2d
2fffee53SArd Biesheuvel	zip1		t4.2d, t8.2d, t4.2d
2fffee53SArd Biesheuvel	zip2		t3.2d, t7.2d, t6.2d
2fffee53SArd Biesheuvel	zip1		t6.2d, t7.2d, t6.2d
2fffee53SArd Biesheuvel
	// Rotate each term by 15/14/13/12 bytes, which realises the byte
	// shifts (8, 16, 24, 32 bits) named in the comments above.
2fffee53SArd Biesheuvel	ext		t4.16b, t4.16b, t4.16b, #15
2fffee53SArd Biesheuvel	ext		t5.16b, t5.16b, t5.16b, #14
2fffee53SArd Biesheuvel	ext		t6.16b, t6.16b, t6.16b, #13
2fffee53SArd Biesheuvel	ext		t3.16b, t3.16b, t3.16b, #12
2fffee53SArd Biesheuvel
	// Leave two partial sums (t4, t6) for the caller to fold in.
2fffee53SArd Biesheuvel	eor		t4.16b, t4.16b, t5.16b
2fffee53SArd Biesheuvel	eor		t6.16b, t6.16b, t3.16b
2fffee53SArd Biesheuvel	ret
0e89640bSMark BrownSYM_FUNC_END(__pmull_p8_core)
2fffee53SArd Biesheuvel
// __pmull_p8: p8 counterpart of __pmull_p64, with the same argument order.
//   rq  destination vector register
//   ad  first operand (A)
//   bd  second operand (B); must be fold_consts
//   i   blank: multiply the low 64-bit halves
//       2:     multiply the high 64-bit halves
// Clobbers the ad alias (v14), t3..t9 and x30 (because of the bl).
2fffee53SArd Biesheuvel	.macro		__pmull_p8, rq, ad, bd, i
	// __pmull_p8_core hard-codes fold_consts and the bd1..bd4 copies
	// derived from it, so any other second operand is a build error.
6227cd12SEric Biggers	.ifnc		\bd, fold_consts
2fffee53SArd Biesheuvel	.err
2fffee53SArd Biesheuvel	.endif
	// Snapshot A first: rq may name the same register as the A operand.
2fffee53SArd Biesheuvel	mov		ad.16b, \ad\().16b
2fffee53SArd Biesheuvel	.ifnb		\i
6227cd12SEric Biggers	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
2fffee53SArd Biesheuvel	.else
6227cd12SEric Biggers	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
2fffee53SArd Biesheuvel	.endif
2fffee53SArd Biesheuvel
	// The i suffix also selects the matching core entry point.
2fffee53SArd Biesheuvel	bl		.L__pmull_p8_core\i
2fffee53SArd Biesheuvel
	// Fold the two partial sums returned by the core into D.
2fffee53SArd Biesheuvel	eor		\rq\().16b, \rq\().16b, t4.16b
2fffee53SArd Biesheuvel	eor		\rq\().16b, \rq\().16b, t6.16b
2fffee53SArd Biesheuvel	.endm
2fffee53SArd Biesheuvel
6227cd12SEric Biggers	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
6227cd12SEric Biggers	// into reg1, reg2.
	//
	// For each of the two registers the new value is
	//   (high half * high half of fold_consts) XOR
	//   (low half  * low half of fold_consts)  XOR  next 16 data bytes
	// using the multiply macro selected by the p argument.
	//
	// Advances buf by 0x20.  Clobbers v8, v9, v11, v12 and whatever the
	// selected multiply macro clobbers.
6227cd12SEric Biggers	.macro		fold_32_bytes, p, reg1, reg2
6227cd12SEric Biggers	ldp		q11, q12, [buf], #0x20
6c1b0da1SArd Biesheuvel
	// High-half product goes to a scratch register first, because the
	// low-half product overwrites the source register.
6227cd12SEric Biggers	__pmull_\p	v8, \reg1, fold_consts, 2
6227cd12SEric Biggers	__pmull_\p	\reg1, \reg1, fold_consts
6c1b0da1SArd Biesheuvel
	// rev64 followed by ext #8 reverses all 16 bytes of the new data.
	// The two steps are placed between the multiplies.
	// NOTE(review): CPU_LE is defined outside this file; presumably it
	// emits its argument on little-endian builds only - confirm.
6c1b0da1SArd BiesheuvelCPU_LE(	rev64		v11.16b, v11.16b		)
6c1b0da1SArd BiesheuvelCPU_LE(	rev64		v12.16b, v12.16b		)
6c1b0da1SArd Biesheuvel
6227cd12SEric Biggers	__pmull_\p	v9, \reg2, fold_consts, 2
6227cd12SEric Biggers	__pmull_\p	\reg2, \reg2, fold_consts
6c1b0da1SArd Biesheuvel
6c1b0da1SArd BiesheuvelCPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
6c1b0da1SArd BiesheuvelCPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
6c1b0da1SArd Biesheuvel
	// Combine both products with the new data.
6c1b0da1SArd Biesheuvel	eor		\reg1\().16b, \reg1\().16b, v8.16b
6c1b0da1SArd Biesheuvel	eor		\reg2\().16b, \reg2\().16b, v9.16b
6c1b0da1SArd Biesheuvel	eor		\reg1\().16b, \reg1\().16b, v11.16b
6c1b0da1SArd Biesheuvel	eor		\reg2\().16b, \reg2\().16b, v12.16b
6c1b0da1SArd Biesheuvel	.endm
6c1b0da1SArd Biesheuvel
6227cd12SEric Biggers	// Fold src_reg into dst_reg, optionally loading the next fold constants
	//
	// dst_reg ^= (low half of src_reg  * low half of fold_consts) XOR
	//            (high half of src_reg * high half of fold_consts)
	//
	// If load_next_consts is non-blank, the next 16 bytes of constants
	// are loaded (advancing fold_consts_ptr by 16) once both multiplies
	// have consumed the current ones, and the per-flavour pre hook is
	// re-run on them.
	//
	// Clobbers v8 and src_reg, plus whatever the multiply macro clobbers.
6227cd12SEric Biggers	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
6227cd12SEric Biggers	__pmull_\p	v8, \src_reg, fold_consts
6227cd12SEric Biggers	__pmull_\p	\src_reg, \src_reg, fold_consts, 2
6227cd12SEric Biggers	.ifnb		\load_next_consts
6227cd12SEric Biggers	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
6227cd12SEric Biggers	__pmull_pre_\p	fold_consts
6c1b0da1SArd Biesheuvel	.endif
6227cd12SEric Biggers	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
6227cd12SEric Biggers	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
6c1b0da1SArd Biesheuvel	.endm
6c1b0da1SArd Biesheuvel
// __pmull_p64: 64x64 -> 128 bit polynomial multiply, rd = rn * rm.
//   n blank: multiply the low 64-bit lanes  (pmull)
//   n given: multiply the high 64-bit lanes (pmull2)
6c1b0da1SArd Biesheuvel	.macro		__pmull_p64, rd, rn, rm, n
6c1b0da1SArd Biesheuvel	.ifnb		\n
6c1b0da1SArd Biesheuvel	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
6c1b0da1SArd Biesheuvel	.else
6c1b0da1SArd Biesheuvel	pmull		\rd\().1q, \rn\().1d, \rm\().1d
6c1b0da1SArd Biesheuvel	.endif
6c1b0da1SArd Biesheuvel	.endm
6c1b0da1SArd Biesheuvel
// crc_t10dif_pmull: complete body of a CRC-T10DIF function, including the
// final ret.  The p argument (p8 or p64) is pasted into the names of the
// init, pre and multiply macros to select the multiply flavour.
//
// In:   init_crc (w0), buf (x1), len (x2).  The entry points state that
//       len >= 16 is assumed.
// Out:  w0 = CRC, taken from the low 16 bits of v0.
// Uses: v0-v7 for the 128 bytes being folded, v7 as the final 16-byte
//       accumulator, v8, v9, v11, v12 as scratch, fold_consts (v10),
//       fold_consts_ptr (x3), x4, plus whatever the multiply macro clobbers.
//
// Paths: len >= 256 runs the 128-bytes-per-iteration loop and then narrows
// 128 -> 16 bytes; shorter inputs start at the less_than_256_bytes label
// with the first 16 bytes in v7.  Both paths share the 16-byte loop, the
// partial-segment handling and the final reduction.
6c1b0da1SArd Biesheuvel	.macro		crc_t10dif_pmull, p
2fffee53SArd Biesheuvel	__pmull_init_\p
2fffee53SArd Biesheuvel
6227cd12SEric Biggers	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
6227cd12SEric Biggers	cmp		len, #256
6227cd12SEric Biggers	b.lt		.Lless_than_256_bytes_\@
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Load the first 128 data bytes.  Byte swapping is necessary to make
6227cd12SEric Biggers	// the bit order match the polynomial coefficient order.
6227cd12SEric Biggers	ldp		q0, q1, [buf]
6227cd12SEric Biggers	ldp		q2, q3, [buf, #0x20]
6227cd12SEric Biggers	ldp		q4, q5, [buf, #0x40]
6227cd12SEric Biggers	ldp		q6, q7, [buf, #0x60]
6227cd12SEric Biggers	add		buf, buf, #0x80
6ef5737fSArd BiesheuvelCPU_LE(	rev64		v0.16b, v0.16b			)
6ef5737fSArd BiesheuvelCPU_LE(	rev64		v1.16b, v1.16b			)
6ef5737fSArd BiesheuvelCPU_LE(	rev64		v2.16b, v2.16b			)
6ef5737fSArd BiesheuvelCPU_LE(	rev64		v3.16b, v3.16b			)
6ef5737fSArd BiesheuvelCPU_LE(	rev64		v4.16b, v4.16b			)
6ef5737fSArd BiesheuvelCPU_LE(	rev64		v5.16b, v5.16b			)
6ef5737fSArd BiesheuvelCPU_LE(	rev64		v6.16b, v6.16b			)
6ef5737fSArd BiesheuvelCPU_LE(	rev64		v7.16b, v7.16b			)
6ef5737fSArd BiesheuvelCPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
6ef5737fSArd BiesheuvelCPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
6ef5737fSArd BiesheuvelCPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
6ef5737fSArd BiesheuvelCPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
6ef5737fSArd BiesheuvelCPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
6ef5737fSArd BiesheuvelCPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
6ef5737fSArd BiesheuvelCPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
6ef5737fSArd BiesheuvelCPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// XOR the first 16 data *bits* with the initial CRC value.
6227cd12SEric Biggers	movi		v8.16b, #0
6227cd12SEric Biggers	mov		v8.h[7], init_crc
6227cd12SEric Biggers	eor		v0.16b, v0.16b, v8.16b
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Load the constants for folding across 128 bytes.
	// (No post-increment here: the pointer is advanced past these
	// constants only after the 128-byte loop.)
6227cd12SEric Biggers	ld1		{fold_consts.2d}, [fold_consts_ptr]
6227cd12SEric Biggers	__pmull_pre_\p	fold_consts
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
6227cd12SEric Biggers	// 128 to simplify the termination condition of the following loop.
6227cd12SEric Biggers	sub		len, len, #256
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
6227cd12SEric Biggers	// bytes v0-v7 into them, storing the result back into v0-v7.
6227cd12SEric Biggers.Lfold_128_bytes_loop_\@:
6227cd12SEric Biggers	fold_32_bytes	\p, v0, v1
6227cd12SEric Biggers	fold_32_bytes	\p, v2, v3
6227cd12SEric Biggers	fold_32_bytes	\p, v4, v5
6227cd12SEric Biggers	fold_32_bytes	\p, v6, v7
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	subs		len, len, #128
fc754c02SArd Biesheuvel	b.ge		.Lfold_128_bytes_loop_\@
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Fold across 64 bytes.
	// Skip the 128-byte constants, then load the 64-byte ones.  The
	// fold_16_bytes calls with a trailing 1 step on to the 32-byte and
	// then the 16-byte constants, following the table order in .rodata.
6227cd12SEric Biggers	add		fold_consts_ptr, fold_consts_ptr, #16
6227cd12SEric Biggers	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
6227cd12SEric Biggers	__pmull_pre_\p	fold_consts
6227cd12SEric Biggers	fold_16_bytes	\p, v0, v4
6227cd12SEric Biggers	fold_16_bytes	\p, v1, v5
6227cd12SEric Biggers	fold_16_bytes	\p, v2, v6
6227cd12SEric Biggers	fold_16_bytes	\p, v3, v7, 1
6227cd12SEric Biggers	// Fold across 32 bytes.
6227cd12SEric Biggers	fold_16_bytes	\p, v4, v6
6227cd12SEric Biggers	fold_16_bytes	\p, v5, v7, 1
6227cd12SEric Biggers	// Fold across 16 bytes.
6227cd12SEric Biggers	fold_16_bytes	\p, v6, v7
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Add 128 to get the correct number of data bytes remaining in 0...127
6227cd12SEric Biggers	// (not counting v7), following the previous extra subtraction by 128.
6227cd12SEric Biggers	// Then subtract 16 to simplify the termination condition of the
6227cd12SEric Biggers	// following loop.
6227cd12SEric Biggers	adds		len, len, #(128-16)
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
6227cd12SEric Biggers	// into them, storing the result back into v7.
6227cd12SEric Biggers	b.lt		.Lfold_16_bytes_loop_done_\@
6227cd12SEric Biggers.Lfold_16_bytes_loop_\@:
6227cd12SEric Biggers	__pmull_\p	v8, v7, fold_consts
6227cd12SEric Biggers	__pmull_\p	v7, v7, fold_consts, 2
6ef5737fSArd Biesheuvel	eor		v7.16b, v7.16b, v8.16b
6227cd12SEric Biggers	ldr		q0, [buf], #16
6ef5737fSArd BiesheuvelCPU_LE(	rev64		v0.16b, v0.16b			)
6ef5737fSArd BiesheuvelCPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
6ef5737fSArd Biesheuvel	eor		v7.16b, v7.16b, v0.16b
6227cd12SEric Biggers	subs		len, len, #16
6227cd12SEric Biggers	b.ge		.Lfold_16_bytes_loop_\@
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers.Lfold_16_bytes_loop_done_\@:
6227cd12SEric Biggers	// Add 16 to get the correct number of data bytes remaining in 0...15
6227cd12SEric Biggers	// (not counting v7), following the previous extra subtraction by 16.
6227cd12SEric Biggers	adds		len, len, #16
6227cd12SEric Biggers	b.eq		.Lreduce_final_16_bytes_\@
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers.Lhandle_partial_segment_\@:
6227cd12SEric Biggers	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
6227cd12SEric Biggers	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
6227cd12SEric Biggers	// do this without needing a fold constant for each possible 'len',
6227cd12SEric Biggers	// redivide the bytes into a first chunk of 'len' bytes and a second
6227cd12SEric Biggers	// chunk of 16 bytes, then fold the first chunk into the second.
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// v0 = last 16 original data bytes
	// (This load reaches back into bytes that were already consumed.)
6227cd12SEric Biggers	add		buf, buf, len
6227cd12SEric Biggers	ldr		q0, [buf, #-16]
6227cd12SEric BiggersCPU_LE(	rev64		v0.16b, v0.16b			)
6227cd12SEric BiggersCPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
6227cd12SEric Biggers	adr_l		x4, .Lbyteshift_table + 16
6227cd12SEric Biggers	sub		x4, x4, len
6227cd12SEric Biggers	ld1		{v2.16b}, [x4]
6227cd12SEric Biggers	tbl		v1.16b, {v7.16b}, v2.16b
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
6227cd12SEric Biggers	movi		v3.16b, #0x80
6227cd12SEric Biggers	eor		v2.16b, v2.16b, v3.16b
6227cd12SEric Biggers	tbl		v3.16b, {v7.16b}, v2.16b
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
6227cd12SEric Biggers	sshr		v2.16b, v2.16b, #7
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
6227cd12SEric Biggers	// then '16-len' bytes from v1 (high-order bytes).
6227cd12SEric Biggers	bsl		v2.16b, v1.16b, v0.16b
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Fold the first chunk into the second chunk, storing the result in v7.
6227cd12SEric Biggers	__pmull_\p	v0, v3, fold_consts
6227cd12SEric Biggers	__pmull_\p	v7, v3, fold_consts, 2
6ef5737fSArd Biesheuvel	eor		v7.16b, v7.16b, v0.16b
6227cd12SEric Biggers	eor		v7.16b, v7.16b, v2.16b
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers.Lreduce_final_16_bytes_\@:
6227cd12SEric Biggers	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
	// On every path fold_consts_ptr now points just past the 16-byte
	// fold constants, i.e. at the final fold constants.
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	movi		v2.16b, #0		// init zero register
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
6227cd12SEric Biggers	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
6227cd12SEric Biggers	__pmull_pre_\p	fold_consts
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Fold the high 64 bits into the low 64 bits, while also multiplying by
6227cd12SEric Biggers	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
6227cd12SEric Biggers	// whose low 48 bits are 0.
6227cd12SEric Biggers	ext		v0.16b, v2.16b, v7.16b, #8
6227cd12SEric Biggers	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
6227cd12SEric Biggers	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
6227cd12SEric Biggers	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
6227cd12SEric Biggers	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
6227cd12SEric Biggers	mov		v0.s[3], v2.s[0]	// zero high 32 bits
6227cd12SEric Biggers	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
6227cd12SEric Biggers	eor		v0.16b, v0.16b, v1.16b	// + low bits
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Load G(x) and floor(x^48 / G(x)).
6227cd12SEric Biggers	ld1		{fold_consts.2d}, [fold_consts_ptr]
6227cd12SEric Biggers	__pmull_pre_\p	fold_consts
6227cd12SEric Biggers
6227cd12SEric Biggers	// Use Barrett reduction to compute the final CRC value.
6227cd12SEric Biggers	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
6227cd12SEric Biggers	ushr		v1.2d, v1.2d, #32	// /= x^32
6227cd12SEric Biggers	__pmull_\p	v1, v1, fold_consts	// *= G(x)
6227cd12SEric Biggers	ushr		v0.2d, v0.2d, #48
6227cd12SEric Biggers	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
6227cd12SEric Biggers	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
6227cd12SEric Biggers
6227cd12SEric Biggers	umov		w0, v0.h[0]
	// Only crc_t10dif_pmull_p8 does a frame_push on entry, so only the
	// p8 expansion undoes it before returning.
fc754c02SArd Biesheuvel	.ifc		\p, p8
*489a4a05SArd Biesheuvel	frame_pop
fc754c02SArd Biesheuvel	.endif
6ef5737fSArd Biesheuvel	ret
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers.Lless_than_256_bytes_\@:
6227cd12SEric Biggers	// Checksumming a buffer of length 16...255 bytes
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Load the first 16 data bytes.
6227cd12SEric Biggers	ldr		q7, [buf], #0x10
6ef5737fSArd BiesheuvelCPU_LE(	rev64		v7.16b, v7.16b			)
6ef5737fSArd BiesheuvelCPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// XOR the first 16 data *bits* with the initial CRC value.
6227cd12SEric Biggers	movi		v0.16b, #0
6227cd12SEric Biggers	mov		v0.h[7], init_crc
6227cd12SEric Biggers	eor		v7.16b, v7.16b, v0.16b
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers	// Load the fold-across-16-bytes constants.
6227cd12SEric Biggers	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
6227cd12SEric Biggers	__pmull_pre_\p	fold_consts
6ef5737fSArd Biesheuvel
	// Dispatch on the length.  The subtraction of 32 accounts for the
	// 16 bytes already in v7 plus the 16-byte bias the shared loop
	// expects; the partial-segment path wants that bias removed again.
6227cd12SEric Biggers	cmp		len, #16
6227cd12SEric Biggers	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
6227cd12SEric Biggers	subs		len, len, #32
6227cd12SEric Biggers	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
6227cd12SEric Biggers	add		len, len, #16
6227cd12SEric Biggers	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
6c1b0da1SArd Biesheuvel	.endm
6c1b0da1SArd Biesheuvel
6227cd12SEric Biggers//
6227cd12SEric Biggers// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
6227cd12SEric Biggers//
6227cd12SEric Biggers// Assumes len >= 16.
6227cd12SEric Biggers//
// Variant built on 8-bit polynomial multiplies (pmull on .8b/.16b lanes).
// The body, including the return, comes from the crc_t10dif_pmull macro.
// Every multiply in this variant reaches __pmull_p8_core through bl, which
// overwrites the link register x30, so a frame is pushed here and popped by
// the macro just before its ret.
// NOTE(review): frame_push/frame_pop are defined outside this file; the
// meaning of the argument 1 is not visible here - confirm in
// <asm/assembler.h>.
0e89640bSMark BrownSYM_FUNC_START(crc_t10dif_pmull_p8)
*489a4a05SArd Biesheuvel	frame_push	1
2fffee53SArd Biesheuvel	crc_t10dif_pmull p8
0e89640bSMark BrownSYM_FUNC_END(crc_t10dif_pmull_p8)
2fffee53SArd Biesheuvel
2fffee53SArd Biesheuvel	.align		5
6227cd12SEric Biggers//
6227cd12SEric Biggers// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
6227cd12SEric Biggers//
6227cd12SEric Biggers// Assumes len >= 16.
6227cd12SEric Biggers//
// Variant built on the 64-bit pmull/pmull2 instructions (.1q result).  The
// body, including the return, comes from the crc_t10dif_pmull macro.  This
// expansion contains no bl, so no stack frame is set up.
0e89640bSMark BrownSYM_FUNC_START(crc_t10dif_pmull_p64)
6c1b0da1SArd Biesheuvel	crc_t10dif_pmull	p64
0e89640bSMark BrownSYM_FUNC_END(crc_t10dif_pmull_p64)
6ef5737fSArd Biesheuvel
325f562dSArd Biesheuvel	.section	".rodata", "a"
6ef5737fSArd Biesheuvel	.align		4
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers// Fold constants precomputed from the polynomial 0x18bb7
6227cd12SEric Biggers// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
//
// The order of the entries is significant: crc_t10dif_pmull walks this table
// front to back, 16 bytes at a time, through fold_consts_ptr.  Only the two
// uncommented labels are referenced by the code; the commented-out labels
// just name the pairs that are reached by advancing the pointer.
6227cd12SEric Biggers.Lfold_across_128_bytes_consts:
6227cd12SEric Biggers	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
6227cd12SEric Biggers	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
6227cd12SEric Biggers// .Lfold_across_64_bytes_consts:
6227cd12SEric Biggers	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
6227cd12SEric Biggers	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
6227cd12SEric Biggers// .Lfold_across_32_bytes_consts:
6227cd12SEric Biggers	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
6227cd12SEric Biggers	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
6227cd12SEric Biggers.Lfold_across_16_bytes_consts:
6227cd12SEric Biggers	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
6227cd12SEric Biggers	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
6227cd12SEric Biggers// .Lfinal_fold_consts:
6227cd12SEric Biggers	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
6227cd12SEric Biggers	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
6227cd12SEric Biggers// .Lbarrett_reduction_consts:
6227cd12SEric Biggers	.quad		0x0000000000018bb7	// G(x)
6227cd12SEric Biggers	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
6ef5737fSArd Biesheuvel
6227cd12SEric Biggers// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
6227cd12SEric Biggers// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
6227cd12SEric Biggers// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
// Indices with bit 7 set are out of range for a single-register tbl, which
// yields zero for those bytes; that is what turns the lookup into a shift.
6227cd12SEric Biggers.Lbyteshift_table:
6ef5737fSArd Biesheuvel	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
6ef5737fSArd Biesheuvel	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
6ef5737fSArd Biesheuvel	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
6ef5737fSArd Biesheuvel	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0