/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Implementation of POLYVAL using ARMv8 Crypto Extensions.
 *
 * Copyright 2021 Google LLC
 */
/*
 * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions.
 * It works on 8 blocks at a time, by precomputing the first 8 key powers h^8,
 * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
 * finite field multiplication into two steps.
 *
 * In the first step, we consider h^i, m_i as normal polynomials of degree less
 * than 128. We then compute p(x) = h^8*m_0 + ... + h^1*m_7, where
 * multiplication is simply polynomial multiplication.
 *
 * In the second step, we compute the reduction of p(x) modulo the finite field
 * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
 *
 * This two-step process is equivalent to computing h^8*m_0 + ... + h^1*m_7,
 * where multiplication is finite field multiplication. The advantage is that
 * the two-step process only requires 1 finite field reduction for every 8
 * polynomial multiplications. Further parallelism is gained by interleaving the
 * multiplications and polynomial reductions.
 */
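
/*
 * As a rough scalar sketch of the schedule above (polyval_mul_nonred() and
 * polyval_reduce() are hypothetical helpers standing in for the pmull-based
 * Karatsuba and Montgomery-reduction macros below; addition in GF(2^128) is
 * XOR):
 *
 *	u256 p = 0;				// unreduced 256-bit sum
 *	for (i = 0; i < 8; i++)
 *		p ^= polyval_mul_nonred(h_pow[i], m[i]);  // h^{8-i} * m_i
 *	acc = polyval_reduce(p);	// one reduction per 8 multiplications
 */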

#include <linux/linkage.h>
#define STRIDE_BLOCKS 8

KEY_POWERS	.req	x0
MSG		.req	x1
BLOCKS_LEFT	.req	x2
ACCUMULATOR	.req	x3
KEY_START	.req	x10
EXTRA_BYTES	.req	x11
TMP		.req	x13

M0	.req	v0
M1	.req	v1
M2	.req	v2
M3	.req	v3
M4	.req	v4
M5	.req	v5
M6	.req	v6
M7	.req	v7
KEY8	.req	v8
KEY7	.req	v9
KEY6	.req	v10
KEY5	.req	v11
KEY4	.req	v12
KEY3	.req	v13
KEY2	.req	v14
KEY1	.req	v15
PL	.req	v16
PH	.req	v17
TMP_V	.req	v18
LO	.req	v20
MI	.req	v21
HI	.req	v22
SUM	.req	v23
GSTAR	.req	v24

	.text

	.arch	armv8-a+crypto
	.align	4

.Lgstar:
	.quad	0xc200000000000000, 0xc200000000000000
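
/*
 * .Lgstar above encodes g*(x), bits 64-127 of g(x) = x^128 + x^127 + x^126 +
 * x^121 + 1: the terms x^127, x^126 and x^121 become bits 63, 62 and 57 of
 * the 64-bit constant, i.e. 0xc200000000000000. The value is duplicated into
 * both lanes so that it can be used by both pmull and pmull2 without a
 * shuffle.
 */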

/*
 * Computes the product of two 128-bit polynomials in X and Y and XORs the
 * components of the 256-bit product into LO, MI, HI.
 *
 * Given:
 *	X = [X_1 : X_0]
 *	Y = [Y_1 : Y_0]
 *
 * We compute:
 *	LO += X_0 * Y_0
 *	MI += (X_0 + X_1) * (Y_0 + Y_1)
 *	HI += X_1 * Y_1
 *
 * Later, the 256-bit result can be extracted as:
 *	[HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
 * This step is done when computing the polynomial reduction for efficiency
 * reasons.
 *
 * Karatsuba multiplication is used instead of Schoolbook multiplication
 * because it was found to be slightly faster on ARM64 CPUs.
 */
.macro karatsuba1 X Y
	X .req \X
	Y .req \Y
	ext	v25.16b, X.16b, X.16b, #8
	ext	v26.16b, Y.16b, Y.16b, #8
	eor	v25.16b, v25.16b, X.16b
	eor	v26.16b, v26.16b, Y.16b
	pmull2	v28.1q, X.2d, Y.2d
	pmull	v29.1q, X.1d, Y.1d
	pmull	v27.1q, v25.1d, v26.1d
	eor	HI.16b, HI.16b, v28.16b
	eor	LO.16b, LO.16b, v29.16b
	eor	MI.16b, MI.16b, v27.16b
	.unreq X
	.unreq Y
.endm

/*
 * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
 * them.
 */
.macro karatsuba1_store X Y
	X .req \X
	Y .req \Y
	ext	v25.16b, X.16b, X.16b, #8
	ext	v26.16b, Y.16b, Y.16b, #8
	eor	v25.16b, v25.16b, X.16b
	eor	v26.16b, v26.16b, Y.16b
	pmull2	HI.1q, X.2d, Y.2d
	pmull	LO.1q, X.1d, Y.1d
	pmull	MI.1q, v25.1d, v26.1d
	.unreq X
	.unreq Y
.endm

/*
 * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
 * the result in PL, PH.
 * [PH : PL] =
 *   [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
 */
.macro karatsuba2
	// v4 = [HI_1 + MI_1 : HI_0 + MI_0]
	eor	v4.16b, HI.16b, MI.16b
	// v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
	eor	v4.16b, v4.16b, LO.16b
	// v5 = [HI_0 : LO_1]
	ext	v5.16b, LO.16b, HI.16b, #8
	// v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
	eor	v4.16b, v4.16b, v5.16b
	// HI = [HI_0 : HI_1]
	ext	HI.16b, HI.16b, HI.16b, #8
	// LO = [LO_0 : LO_1]
	ext	LO.16b, LO.16b, LO.16b, #8
	// PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
	ext	PH.16b, v4.16b, HI.16b, #8
	// PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
	ext	PL.16b, LO.16b, v4.16b, #8
.endm
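
/*
 * For reference, the identity implemented by karatsuba1/karatsuba2 above,
 * written out in scalar form. clmul() is a hypothetical 64x64 -> 128-bit
 * carryless multiply standing in for pmull/pmull2:
 *
 *	LO = clmul(X_0, Y_0);
 *	HI = clmul(X_1, Y_1);
 *	MI = clmul(X_0 ^ X_1, Y_0 ^ Y_1);
 *	// cross term X_1*Y_0 + X_0*Y_1 = MI + LO + HI in GF(2), so
 *	// X * Y = HI*x^128 + (MI + LO + HI)*x^64 + LO
 *
 * Expanding into 64-bit words gives exactly the extraction
 *	[HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
 * computed by karatsuba2.
 */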

/*
 * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
 *
 * This macro computes p(x) mod g(x) where p(x) is in Montgomery form and
 * g(x) = x^128 + x^127 + x^126 + x^121 + 1.
 *
 * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
 * product of two 128-bit polynomials in Montgomery form. We need to reduce it
 * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
 * of x^128, this product has two extra factors of x^128. To get it back into
 * Montgomery form, we need to remove one of these factors by dividing by
 * x^128.
 *
 * To accomplish both of these goals, we add multiples of g(x) that cancel out
 * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
 * bits are zero, the polynomial division by x^128 can be done by right
 * shifting.
 *
 * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
 * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
 * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
 * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this
 * to the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0,
 * where T = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits
 * 64-191.
 *
 * Repeating this same process on the next 64 bits "folds" bits 64-127 into
 * bits 128-255, giving the answer in bits 128-255. This time, we need to
 * cancel P_1 + T_0 in bits 64-127. The multiple of g(x) required is
 * (P_1 + T_0) * g(x) * x^64. Adding this to our previous computation gives
 * P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 : 0 : 0, where
 * V = V_1 : V_0 = g*(x) * (P_1 + T_0).
 *
 * So our final computation is:
 *	T = T_1 : T_0 = g*(x) * P_0
 *	V = V_1 : V_0 = g*(x) * (P_1 + T_0)
 *	p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
 *
 * The implementation below saves an XOR instruction by computing P_1 + T_0 :
 * P_0 + T_1 and XORing into dest, rather than separately XORing P_1 : P_0
 * and T_0 : T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
 */
.macro montgomery_reduction dest
	DEST .req \dest
	// TMP_V = T_1 : T_0 = P_0 * g*(x)
	pmull	TMP_V.1q, PL.1d, GSTAR.1d
	// TMP_V = T_0 : T_1
	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
	// TMP_V = P_1 + T_0 : P_0 + T_1
	eor	TMP_V.16b, PL.16b, TMP_V.16b
	// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
	eor	PH.16b, PH.16b, TMP_V.16b
	// TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
	eor	DEST.16b, PH.16b, TMP_V.16b
	.unreq DEST
.endm
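
/*
 * A scalar sketch of the two folds above, using the same hypothetical
 * clmul() helper (GSTAR holds g*(x) in both 64-bit lanes):
 *
 *	T = clmul(P_0, GSTAR);		// T = T_1 : T_0
 *	U_1 = P_1 ^ T_0;		// fold bits 0-63 upward
 *	U_0 = P_0 ^ T_1;
 *	V = clmul(U_1, GSTAR);		// V = V_1 : V_0
 *	// divide by x^128 by keeping only the high 128 bits
 *	dest = (P_3 ^ U_1 ^ V_1) : (P_2 ^ U_0 ^ V_0);
 */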

/*
 * Compute Polyval on 8 blocks.
 *
 * If reduce is set, also computes the Montgomery reduction of the
 * previous full_stride call and XORs with the first message block.
 * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
 * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
 *
 * Sets PL, PH.
 */
.macro full_stride reduce
	eor	LO.16b, LO.16b, LO.16b
	eor	MI.16b, MI.16b, MI.16b
	eor	HI.16b, HI.16b, HI.16b

	ld1	{M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
	ld1	{M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64

	karatsuba1 M7 KEY1
	.if \reduce
	pmull	TMP_V.1q, PL.1d, GSTAR.1d
	.endif

	karatsuba1 M6 KEY2
	.if \reduce
	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
	.endif

	karatsuba1 M5 KEY3
	.if \reduce
	eor	TMP_V.16b, PL.16b, TMP_V.16b
	.endif

	karatsuba1 M4 KEY4
	.if \reduce
	eor	PH.16b, PH.16b, TMP_V.16b
	.endif

	karatsuba1 M3 KEY5
	.if \reduce
	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
	.endif

	karatsuba1 M2 KEY6
	.if \reduce
	eor	SUM.16b, PH.16b, TMP_V.16b
	.endif

	karatsuba1 M1 KEY7
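	// Fold the running sum into m_0: SUM holds the accumulator on the
	// first stride and REDUCE(PL, PH) of the previous stride afterwards.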
	eor	M0.16b, M0.16b, SUM.16b

	karatsuba1 M0 KEY8
	karatsuba2
.endm

/*
 * Handle any extra blocks after full_stride loop.
 */
.macro partial_stride
	add	KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
	sub	KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
	ld1	{KEY1.16b}, [KEY_POWERS], #16

	ld1	{TMP_V.16b}, [MSG], #16
	eor	SUM.16b, SUM.16b, TMP_V.16b
	karatsuba1_store KEY1 SUM
	sub	BLOCKS_LEFT, BLOCKS_LEFT, #1

	tst	BLOCKS_LEFT, #4
	beq	.Lpartial4BlocksDone
	ld1	{M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
	ld1	{KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
	karatsuba1 M0 KEY8
	karatsuba1 M1 KEY7
	karatsuba1 M2 KEY6
	karatsuba1 M3 KEY5
.Lpartial4BlocksDone:
	tst	BLOCKS_LEFT, #2
	beq	.Lpartial2BlocksDone
	ld1	{M0.16b, M1.16b}, [MSG], #32
	ld1	{KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
	karatsuba1 M0 KEY8
	karatsuba1 M1 KEY7
.Lpartial2BlocksDone:
	tst	BLOCKS_LEFT, #1
	beq	.LpartialDone
	ld1	{M0.16b}, [MSG], #16
	ld1	{KEY8.16b}, [KEY_POWERS], #16
	karatsuba1 M0 KEY8
.LpartialDone:
	karatsuba2
	montgomery_reduction SUM
.endm
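
/*
 * Note on the KEY_POWERS arithmetic at the top of partial_stride: the key
 * powers are stored consecutively as h^8, ..., h^1 at 16 bytes each, so
 * setting the pointer to KEY_START + (STRIDE_BLOCKS - BLOCKS_LEFT) * 16
 * makes the first key consumed h^BLOCKS_LEFT. E.g. with 3 blocks left it
 * lands on h^3, and the tail computes h^3*(m_0 + SUM) + h^2*m_1 + h^1*m_2.
 */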

/*
 * Perform Montgomery multiplication in GF(2^128) and store result in op1.
 *
 * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
 * If op1, op2 are in Montgomery form, this computes the Montgomery
 * form of op1*op2.
 *
 * void pmull_polyval_mul(u8 *op1, const u8 *op2);
 */
SYM_FUNC_START(pmull_polyval_mul)
	adr	TMP, .Lgstar
	ld1	{GSTAR.2d}, [TMP]
	ld1	{v0.16b}, [x0]
	ld1	{v1.16b}, [x1]
	karatsuba1_store v0 v1
	karatsuba2
	montgomery_reduction SUM
	st1	{SUM.16b}, [x0]
	ret
SYM_FUNC_END(pmull_polyval_mul)

/*
 * Perform polynomial evaluation as specified by POLYVAL. This computes:
 *	h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
 * where n=nblocks, h is the hash key, and m_i are the message blocks.
 *
 * x0 - pointer to precomputed key powers h^8 ... h^1
 * x1 - pointer to message blocks
 * x2 - number of blocks to hash
 * x3 - pointer to accumulator
 *
 * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in,
 *			     size_t nblocks, u8 *accumulator);
 */
SYM_FUNC_START(pmull_polyval_update)
	adr	TMP, .Lgstar
	mov	KEY_START, KEY_POWERS
	ld1	{GSTAR.2d}, [TMP]
	ld1	{SUM.16b}, [ACCUMULATOR]
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	blt	.LstrideLoopExit
	ld1	{KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
	ld1	{KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
	full_stride 0
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	blt	.LstrideLoopExitReduce
.LstrideLoop:
	full_stride 1
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	bge	.LstrideLoop
.LstrideLoopExitReduce:
	montgomery_reduction SUM
.LstrideLoopExit:
	adds	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	beq	.LskipPartial
	partial_stride
.LskipPartial:
	st1	{SUM.16b}, [ACCUMULATOR]
	ret
SYM_FUNC_END(pmull_polyval_update)
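
/*
 * For illustration only: pmull_polyval_update() above is semantically
 * equivalent to this C sketch, where polyval_mul() is a hypothetical helper
 * with the semantics of pmull_polyval_mul() and h1 is the key power h^1:
 *
 *	static void polyval_update_ref(const u8 *h1, const u8 *in,
 *				       size_t nblocks, u8 accumulator[16])
 *	{
 *		while (nblocks--) {
 *			for (int i = 0; i < 16; i++)
 *				accumulator[i] ^= in[i];
 *			polyval_mul(accumulator, h1);	// acc = (acc + m_i)*h
 *			in += 16;
 *		}
 *	}
 */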