//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//       Reference paper titled "Fast CRC Computation for Generic
//       Polynomials Using PCLMULQDQ Instruction"
//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	// Arguments, in the order given by the C prototypes of the two entry
	// points (crc_t10dif_pmull_p8 / crc_t10dif_pmull_p64).
	init_crc	.req	w0
	buf		.req	x1
	len		.req	x2
	// Cursor into the constant tables in .rodata.
	fold_consts_ptr	.req	x3

	// The pair of fold constants currently in use.
	fold_consts	.req	v10

	// Everything below is only referenced by the p8 (8x8-bit PMULL)
	// helpers; the p64 helpers do not touch these registers.

	// Copy of the multiplicand handed to __pmull_p8_core.
	ad		.req	v14

	// Masks built by __pmull_init_p8.
	k00_16		.req	v15
	k32_48		.req	v16

	// Temporaries of __pmull_p8_core.
	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	// tbl index vectors built by __pmull_init_p8.
	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	// Permuted copies of fold_consts built by __pmull_pre_p8.
	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31

	// The p64 variant needs no one-time setup ...
	.macro		__pmull_init_p64
	.endm

	// ... and no per-constant preprocessing.
	.macro		__pmull_pre_p64, bd
	.endm

	// One-time setup for the p8 variant: builds the k00_16/k32_48 masks
	// and the perm1..perm4 index vectors. Clobbers x5.
	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	// (perm1..perm4 rotate each 64-bit half by 1, 2, 3 and 4 bytes when
	// used as a tbl index)
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, perm4.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40
	.endm

	// Derive bd1..bd4 from \bd using perm1..perm4. Must be re-run every
	// time \bd (in practice fold_consts) is reloaded.
	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm

//
// Helper for the __pmull_p8 macro, reached via 'bl' at one of two labels:
//   .L__pmull_p8_core  - operates on the low 64 bits of the operands
//   .L__pmull_p8_core2 - operates on the high 64 bits of the operands
//
// In:       ad, fold_consts, bd1..bd4 (and perm1..perm3 for the core2 entry),
//           k00_16, k32_48
// Out:      t4 and t6, which the caller XORs into its 'D = A*B' product
// Clobbers: t3..t9
//
SYM_FUNC_START_LOCAL(__pmull_p8_core)
.L__pmull_p8_core:
	ext		t4.8b, ad.8b, ad.8b, #1			// A1
	ext		t5.8b, ad.8b, ad.8b, #2			// A2
	ext		t6.8b, ad.8b, ad.8b, #3			// A3

	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	b		0f

.L__pmull_p8_core2:
	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4

	// Common tail of both entry points.
0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
	eor		t5.16b, t5.16b, t7.16b			// M = G + H
	eor		t6.16b, t6.16b, t9.16b			// N = I + J

	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
SYM_FUNC_END(__pmull_p8_core)

	// p8 flavour of the polynomial multiply: \rq = \ad * \bd, using the
	// low halves when \i is blank and the high halves when \i is 2.
	// \bd must be fold_consts (anything else is an assembly-time error),
	// because the core reads fold_consts and bd1..bd4 directly.
	// Clobbers the 'ad' alias (v14), t3..t9 and, through 'bl', the link
	// register x30.
	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, fold_consts
	.err
	.endif
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
	.endif

	bl		.L__pmull_p8_core\i

	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm

	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
	// into reg1, reg2.
	// Advances buf by 32. Clobbers v8, v9, v11 and v12.
	.macro		fold_32_bytes, p, reg1, reg2
	ldp		q11, q12, [buf], #0x20

	__pmull_\p	v8, \reg1, fold_consts, 2
	__pmull_\p	\reg1, \reg1, fold_consts

CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	__pmull_\p	v9, \reg2, fold_consts, 2
	__pmull_\p	\reg2, \reg2, fold_consts

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm

	// Fold src_reg into dst_reg, optionally loading the next fold constants
	// Clobbers v8 and src_reg. When load_next_consts is non-blank,
	// fold_consts is reloaded from fold_consts_ptr, which advances by 16.
	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
	__pmull_\p	v8, \src_reg, fold_consts
	__pmull_\p	\src_reg, \src_reg, fold_consts, 2
	.ifnb		\load_next_consts
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	.endif
	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
	.endm

	// p64 flavour of the polynomial multiply: \rd = \rn * \rm using a
	// single 64x64-bit PMULL (low halves when \n is blank, high halves
	// otherwise).
	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm

	// Body shared by both entry points. \p is 'p8' or 'p64' and selects
	// the __pmull_init_\p / __pmull_pre_\p / __pmull_\p helpers.
	// In:  init_crc (w0), buf (x1), len (x2); the entry points document
	//      that len >= 16 is assumed.
	// Out: CRC in w0. The macro ends the function itself with 'ret'; for
	//      p8 it first undoes the frame pushed by the entry point.
	.macro		crc_t10dif_pmull, p
	__pmull_init_\p

	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp		len, #256
	b.lt		.Lless_than_256_bytes_\@

	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts

	// Load the first 128 data bytes. Byte swapping is necessary to make
	// the bit order match the polynomial coefficient order.
	ldp		q0, q1, [buf]
	ldp		q2, q3, [buf, #0x20]
	ldp		q4, q5, [buf, #0x40]
	ldp		q6, q7, [buf, #0x60]
	add		buf, buf, #0x80
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v8.16b, #0
	mov		v8.h[7], init_crc
	eor		v0.16b, v0.16b, v8.16b

	// Load the constants for folding across 128 bytes.
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Subtract 128 for the 128 data bytes just consumed. Subtract another
	// 128 to simplify the termination condition of the following loop.
	sub		len, len, #256

	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
	// bytes v0-v7 into them, storing the result back into v0-v7.
.Lfold_128_bytes_loop_\@:
	fold_32_bytes	\p, v0, v1
	fold_32_bytes	\p, v2, v3
	fold_32_bytes	\p, v4, v5
	fold_32_bytes	\p, v6, v7

	subs		len, len, #128
	b.ge		.Lfold_128_bytes_loop_\@

	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.

	// Fold across 64 bytes.
	// (fold_consts_ptr still points at the 128-byte constants, so step
	// over them first.)
	add		fold_consts_ptr, fold_consts_ptr, #16
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	fold_16_bytes	\p, v0, v4
	fold_16_bytes	\p, v1, v5
	fold_16_bytes	\p, v2, v6
	fold_16_bytes	\p, v3, v7, 1
	// Fold across 32 bytes.
	fold_16_bytes	\p, v4, v6
	fold_16_bytes	\p, v5, v7, 1
	// Fold across 16 bytes.
	fold_16_bytes	\p, v6, v7

	// Add 128 to get the correct number of data bytes remaining in 0...127
	// (not counting v7), following the previous extra subtraction by 128.
	// Then subtract 16 to simplify the termination condition of the
	// following loop.
	adds		len, len, #(128-16)

	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
	// into them, storing the result back into v7.
	b.lt		.Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
	__pmull_\p	v8, v7, fold_consts
	__pmull_\p	v7, v7, fold_consts, 2
	eor		v7.16b, v7.16b, v8.16b
	ldr		q0, [buf], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		len, len, #16
	b.ge		.Lfold_16_bytes_loop_\@

.Lfold_16_bytes_loop_done_\@:
	// Add 16 to get the correct number of data bytes remaining in 0...15
	// (not counting v7), following the previous extra subtraction by 16.
	adds		len, len, #16
	b.eq		.Lreduce_final_16_bytes_\@

.Lhandle_partial_segment_\@:
	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	// 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
	// do this without needing a fold constant for each possible 'len',
	// redivide the bytes into a first chunk of 'len' bytes and a second
	// chunk of 16 bytes, then fold the first chunk into the second.

	// v0 = last 16 original data bytes
	add		buf, buf, len
	ldr		q0, [buf, #-16]
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)

	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
	adr_l		x4, .Lbyteshift_table + 16
	sub		x4, x4, len
	ld1		{v2.16b}, [x4]
	tbl		v1.16b, {v7.16b}, v2.16b

	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
	movi		v3.16b, #0x80
	eor		v2.16b, v2.16b, v3.16b
	tbl		v3.16b, {v7.16b}, v2.16b

	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
	sshr		v2.16b, v2.16b, #7

	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
	// then '16-len' bytes from v1 (high-order bytes).
	bsl		v2.16b, v1.16b, v0.16b

	// Fold the first chunk into the second chunk, storing the result in v7.
	__pmull_\p	v0, v3, fold_consts
	__pmull_\p	v7, v3, fold_consts, 2
	eor		v7.16b, v7.16b, v0.16b
	eor		v7.16b, v7.16b, v2.16b

.Lreduce_final_16_bytes_\@:
	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

	movi		v2.16b, #0		// init zero register

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	// Fold the high 64 bits into the low 64 bits, while also multiplying by
	// x^64. This produces a 128-bit value congruent to x^64 * M(x) and
	// whose low 48 bits are 0.
	ext		v0.16b, v2.16b, v7.16b, #8
	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
	eor		v0.16b, v0.16b, v7.16b		// + low bits * x^64

	// Fold the high 32 bits into the low 96 bits. This produces a 96-bit
	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
	mov		v0.s[3], v2.s[0]		// zero high 32 bits
	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
	eor		v0.16b, v0.16b, v1.16b		// + low bits

	// Load G(x) and floor(x^48 / G(x)).
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Use Barrett reduction to compute the final CRC value.
	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
	ushr		v1.2d, v1.2d, #32		// /= x^32
	__pmull_\p	v1, v1, fold_consts	// *= G(x)
	ushr		v0.2d, v0.2d, #48
	eor		v0.16b, v0.16b, v1.16b		// + low 16 nonzero bits
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

	umov		w0, v0.h[0]
	// Only the p8 entry point pushed a frame (see crc_t10dif_pmull_p8).
	.ifc		\p, p8
	frame_pop
	.endif
	ret

.Lless_than_256_bytes_\@:
	// Checksumming a buffer of length 16...255 bytes

	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts

	// Load the first 16 data bytes.
	ldr		q7, [buf], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v0.16b, #0
	mov		v0.h[7], init_crc
	eor		v7.16b, v7.16b, v0.16b

	// Load the fold-across-16-bytes constants.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	cmp		len, #16
	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
	subs		len, len, #32
	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
	add		len, len, #16
	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
	.endm

//
// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
// A frame is pushed here because the p8 multiply helper uses 'bl', which
// overwrites the link register; the matching frame_pop and 'ret' are
// emitted inside the crc_t10dif_pmull macro.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
	frame_push	1
	crc_t10dif_pmull	p8
SYM_FUNC_END(crc_t10dif_pmull_p8)

	.align		5
//
// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
// No stack frame is needed here: the p64 multiply helper expands to plain
// pmull/pmull2 instructions and never branches-with-link. The 'ret' is
// emitted inside the crc_t10dif_pmull macro.
//
SYM_FUNC_START(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64
SYM_FUNC_END(crc_t10dif_pmull_p64)

	.section	".rodata", "a"
	.align		4

// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
//
// NOTE: crc_t10dif_pmull walks this table front to back through
// fold_consts_ptr (16 bytes per load), entering either at
// .Lfold_across_128_bytes_consts or at .Lfold_across_16_bytes_consts, so the
// order and size of the entries below must not change.
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	// x^(8*128) mod G(x)
	.quad		0x0000000000002295	// x^(8*128+64) mod G(x)
// .Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	// x^(4*128) mod G(x)
	.quad		0x000000000000dd31	// x^(4*128+64) mod G(x)
// .Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	// x^(2*128) mod G(x)
	.quad		0x0000000000007acc	// x^(2*128+64) mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	// x^(1*128) mod G(x)
	.quad		0x0000000000001faa	// x^(1*128+64) mod G(x)
// .Lfinal_fold_consts:
	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	// G(x)
	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))

// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
// (With len in 1...15 the load covers table bytes 1...30 only, so the first
// and last table entries are never read.)
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0