/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Implementation of POLYVAL using ARMv8 Crypto Extensions.
 *
 * Copyright 2021 Google LLC
 */
/*
 * This is an efficient implementation of POLYVAL using ARMv8 Crypto
 * Extensions. It works on 8 blocks at a time, by precomputing the first 8 key
 * powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation allows
 * us to split finite field multiplication into two steps.
 *
 * In the first step, we consider h^i, m_i as normal polynomials of degree less
 * than 128. We then compute p(x) = h^8*m_0 + ... + h^1*m_7 where
 * multiplication is simply polynomial multiplication.
 *
 * In the second step, we compute the reduction of p(x) modulo the finite field
 * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
 *
 * This two-step process is equivalent to computing h^8*m_0 + ... + h^1*m_7
 * where multiplication is finite field multiplication. The advantage is that
 * the two-step process only requires 1 finite field reduction for every 8
 * polynomial multiplications. Further parallelism is gained by interleaving
 * the multiplications and polynomial reductions.
 */

#include <linux/linkage.h>

#define STRIDE_BLOCKS 8

KEY_POWERS      .req    x0
MSG             .req    x1
BLOCKS_LEFT     .req    x2
ACCUMULATOR     .req    x3
KEY_START       .req    x10
EXTRA_BYTES     .req    x11
TMP             .req    x13

M0      .req    v0
M1      .req    v1
M2      .req    v2
M3      .req    v3
M4      .req    v4
M5      .req    v5
M6      .req    v6
M7      .req    v7
KEY8    .req    v8
KEY7    .req    v9
KEY6    .req    v10
KEY5    .req    v11
KEY4    .req    v12
KEY3    .req    v13
KEY2    .req    v14
KEY1    .req    v15
PL      .req    v16
PH      .req    v17
TMP_V   .req    v18
LO      .req    v20
MI      .req    v21
HI      .req    v22
SUM     .req    v23
GSTAR   .req    v24

        .text

        .arch   armv8-a+crypto
        .align  4

.Lgstar:
        .quad   0xc200000000000000, 0xc200000000000000

/*
 * Computes the product of two 128-bit polynomials in X and Y and XORs the
 * components of the 256-bit product into LO, MI, HI.
 *
 * Given:
 *      X = [X_1 : X_0]
 *      Y = [Y_1 : Y_0]
 *
 * We compute:
 *      LO += X_0 * Y_0
 *      MI += (X_0 + X_1) * (Y_0 + Y_1)
 *      HI += X_1 * Y_1
 *
 * Later, the 256-bit result can be extracted as:
 *      [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
 * This step is done when computing the polynomial reduction for efficiency
 * reasons.
 *
 * Karatsuba multiplication is used instead of Schoolbook multiplication
 * because it was found to be slightly faster on ARM64 CPUs.
 */
.macro karatsuba1 X Y
        X .req \X
        Y .req \Y
        ext     v25.16b, X.16b, X.16b, #8
        ext     v26.16b, Y.16b, Y.16b, #8
        eor     v25.16b, v25.16b, X.16b
        eor     v26.16b, v26.16b, Y.16b
        pmull2  v28.1q, X.2d, Y.2d
        pmull   v29.1q, X.1d, Y.1d
        pmull   v27.1q, v25.1d, v26.1d
        eor     HI.16b, HI.16b, v28.16b
        eor     LO.16b, LO.16b, v29.16b
        eor     MI.16b, MI.16b, v27.16b
        .unreq X
        .unreq Y
.endm

/*
 * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
 * them.
 */
.macro karatsuba1_store X Y
        X .req \X
        Y .req \Y
        ext     v25.16b, X.16b, X.16b, #8
        ext     v26.16b, Y.16b, Y.16b, #8
        eor     v25.16b, v25.16b, X.16b
        eor     v26.16b, v26.16b, Y.16b
        pmull2  HI.1q, X.2d, Y.2d
        pmull   LO.1q, X.1d, Y.1d
        pmull   MI.1q, v25.1d, v26.1d
        .unreq X
        .unreq Y
.endm
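/*
 * For reference, a minimal C sketch of the Karatsuba step the two macros
 * above implement. clmul64() is a hypothetical helper standing in for the
 * PMULL/PMULL2 instructions; neither it nor karatsuba1_c() is part of this
 * file or of any kernel API.
 *
 *	#include <stdint.h>
 *
 *	// Carry-less multiply: 64x64 -> 128-bit polynomial product over GF(2).
 *	static void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
 *	{
 *		uint64_t l = 0, h = 0;
 *		int i;
 *
 *		for (i = 0; i < 64; i++) {
 *			if ((b >> i) & 1) {
 *				l ^= a << i;
 *				if (i)
 *					h ^= a >> (64 - i);
 *			}
 *		}
 *		*lo = l;
 *		*hi = h;
 *	}
 *
 *	// One karatsuba1 step: XOR the three partial products of
 *	// X = [x1 : x0] and Y = [y1 : y0] into LO, MI, HI.
 *	static void karatsuba1_c(uint64_t x0, uint64_t x1,
 *				 uint64_t y0, uint64_t y1,
 *				 uint64_t lo[2], uint64_t mi[2], uint64_t hi[2])
 *	{
 *		uint64_t t0, t1;
 *
 *		clmul64(x0, y0, &t0, &t1);		// X_0 * Y_0
 *		lo[0] ^= t0; lo[1] ^= t1;
 *		clmul64(x1, y1, &t0, &t1);		// X_1 * Y_1
 *		hi[0] ^= t0; hi[1] ^= t1;
 *		clmul64(x0 ^ x1, y0 ^ y1, &t0, &t1);	// (X_0+X_1) * (Y_0+Y_1)
 *		mi[0] ^= t0; mi[1] ^= t1;
 *	}
 */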
/*
 * Computes the 256-bit polynomial represented by LO, HI, MI. Stores the
 * result in PL, PH.
 * [PH : PL] =
 *   [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
 */
.macro karatsuba2
        // v4 = [HI_1 + MI_1 : HI_0 + MI_0]
        eor     v4.16b, HI.16b, MI.16b
        // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
        eor     v4.16b, v4.16b, LO.16b
        // v5 = [HI_0 : LO_1]
        ext     v5.16b, LO.16b, HI.16b, #8
        // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
        eor     v4.16b, v4.16b, v5.16b
        // HI = [HI_0 : HI_1]
        ext     HI.16b, HI.16b, HI.16b, #8
        // LO = [LO_0 : LO_1]
        ext     LO.16b, LO.16b, LO.16b, #8
        // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
        ext     PH.16b, v4.16b, HI.16b, #8
        // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
        ext     PL.16b, LO.16b, v4.16b, #8
.endm

/*
 * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
 *
 * This macro computes p(x) mod g(x) where p(x) is in Montgomery form and
 * g(x) = x^128 + x^127 + x^126 + x^121 + 1.
 *
 * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
 * product of two 128-bit polynomials in Montgomery form. We need to reduce it
 * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
 * of x^128, this product has two extra factors of x^128. To get it back into
 * Montgomery form, we need to remove one of these factors by dividing by
 * x^128.
 *
 * To accomplish both of these goals, we add multiples of g(x) that cancel out
 * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
 * bits are zero, the polynomial division by x^128 can be done by right
 * shifting.
 *
 * Since the only nonzero term in the low 64 bits of g(x) is the constant
 * term, the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU
 * can only do 64x64 bit multiplications, so split P_0 * g(x) into
 * x^128 * P_0 + x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).
 * Adding this to the original polynomial gives
 * P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T = T_1 : T_0 = g*(x) * P_0.
 * Thus, bits 0-63 got "folded" into bits 64-191.
 *
 * Repeating this same process on the next 64 bits "folds" bits 64-127 into
 * bits 128-255, giving the answer in bits 128-255. This time, we need to
 * cancel P_1 + T_0 in bits 64-127. The multiple of g(x) required is
 * (P_1 + T_0) * g(x) * x^64. Adding this to our previous computation gives
 * P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 : 0 : 0, where
 * V = V_1 : V_0 = g*(x) * (P_1 + T_0).
 *
 * So our final computation is:
 *      T = T_1 : T_0 = g*(x) * P_0
 *      V = V_1 : V_0 = g*(x) * (P_1 + T_0)
 *      p(x) / x^128 mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
 *
 * The implementation below saves a XOR instruction by computing
 * P_1 + T_0 : P_0 + T_1 and XORing into dest, rather than separately XORing
 * P_1 : P_0 and T_0 : T_1 into dest. This allows us to reuse P_1 + T_0 when
 * computing V.
 */
.macro montgomery_reduction dest
        DEST .req \dest
        // TMP_V = T_1 : T_0 = P_0 * g*(x)
        pmull   TMP_V.1q, PL.1d, GSTAR.1d
        // TMP_V = T_0 : T_1
        ext     TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
        // TMP_V = P_1 + T_0 : P_0 + T_1
        eor     TMP_V.16b, PL.16b, TMP_V.16b
        // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
        eor     PH.16b, PH.16b, TMP_V.16b
        // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
        pmull2  TMP_V.1q, TMP_V.2d, GSTAR.2d
        eor     DEST.16b, PH.16b, TMP_V.16b
        .unreq DEST
.endm
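/*
 * A minimal C sketch of the reduction above, under the same illustrative
 * assumptions as the earlier sketch (clmul64() standing in for PMULL).
 * 0xc200000000000000 is g*(x), i.e. bits 64-127 of g(x), matching .Lgstar.
 *
 *	// Reduce p = [P_3 : P_2 : P_1 : P_0] mod g(x) and divide by x^128,
 *	// writing the 128-bit result to res = [res[1] : res[0]].
 *	static void montgomery_reduction_c(const uint64_t p[4], uint64_t res[2])
 *	{
 *		const uint64_t gstar = 0xc200000000000000ULL;
 *		uint64_t t0, t1, v0, v1;
 *
 *		clmul64(p[0], gstar, &t0, &t1);		// T = g*(x) * P_0
 *		clmul64(p[1] ^ t0, gstar, &v0, &v1);	// V = g*(x) * (P_1 + T_0)
 *		res[0] = p[2] ^ p[0] ^ t1 ^ v0;		// P_2 + P_0 + T_1 + V_0
 *		res[1] = p[3] ^ p[1] ^ t0 ^ v1;		// P_3 + P_1 + T_0 + V_1
 *	}
 */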
/*
 * Compute POLYVAL on 8 blocks.
 *
 * If reduce is set, also computes the Montgomery reduction of the result of
 * the previous full_stride call and XORs it with the first message block,
 * computing (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1; i.e., the first
 * multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. The reduction
 * steps are interleaved with the multiplications to hide their latency.
 *
 * Sets PL, PH.
 */
.macro full_stride reduce
        eor     LO.16b, LO.16b, LO.16b
        eor     MI.16b, MI.16b, MI.16b
        eor     HI.16b, HI.16b, HI.16b

        ld1     {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
        ld1     {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64

        karatsuba1 M7 KEY1
        .if \reduce
        pmull   TMP_V.1q, PL.1d, GSTAR.1d
        .endif

        karatsuba1 M6 KEY2
        .if \reduce
        ext     TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
        .endif

        karatsuba1 M5 KEY3
        .if \reduce
        eor     TMP_V.16b, PL.16b, TMP_V.16b
        .endif

        karatsuba1 M4 KEY4
        .if \reduce
        eor     PH.16b, PH.16b, TMP_V.16b
        .endif

        karatsuba1 M3 KEY5
        .if \reduce
        pmull2  TMP_V.1q, TMP_V.2d, GSTAR.2d
        .endif

        karatsuba1 M2 KEY6
        .if \reduce
        eor     SUM.16b, PH.16b, TMP_V.16b
        .endif

        karatsuba1 M1 KEY7
        eor     M0.16b, M0.16b, SUM.16b

        karatsuba1 M0 KEY8
        karatsuba2
.endm

/*
 * Handle any extra blocks after the full_stride loop. For n remaining blocks
 * (1 <= n < STRIDE_BLOCKS), the key powers h^n, ..., h^1 are used, starting
 * at KEY_START + (STRIDE_BLOCKS - n) * 16. The accumulator is XORed into the
 * first remaining block, which is then multiplied by h^n.
 */
.macro partial_stride
        add     KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
        sub     KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
        ld1     {KEY1.16b}, [KEY_POWERS], #16

        ld1     {TMP_V.16b}, [MSG], #16
        eor     SUM.16b, SUM.16b, TMP_V.16b
        karatsuba1_store KEY1 SUM
        sub     BLOCKS_LEFT, BLOCKS_LEFT, #1

        tst     BLOCKS_LEFT, #4
        beq     .Lpartial4BlocksDone
        ld1     {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
        ld1     {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
        karatsuba1 M0 KEY8
        karatsuba1 M1 KEY7
        karatsuba1 M2 KEY6
        karatsuba1 M3 KEY5
.Lpartial4BlocksDone:
        tst     BLOCKS_LEFT, #2
        beq     .Lpartial2BlocksDone
        ld1     {M0.16b, M1.16b}, [MSG], #32
        ld1     {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
        karatsuba1 M0 KEY8
        karatsuba1 M1 KEY7
.Lpartial2BlocksDone:
        tst     BLOCKS_LEFT, #1
        beq     .LpartialDone
        ld1     {M0.16b}, [MSG], #16
        ld1     {KEY8.16b}, [KEY_POWERS], #16
        karatsuba1 M0 KEY8
.LpartialDone:
        karatsuba2
        montgomery_reduction SUM
.endm

/*
 * Perform Montgomery multiplication in GF(2^128) and store the result in op1.
 *
 * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
 * If op1, op2 are in Montgomery form, this computes the Montgomery form of
 * op1*op2.
 *
 * void pmull_polyval_mul(u8 *op1, const u8 *op2);
 */
SYM_FUNC_START(pmull_polyval_mul)
        adr     TMP, .Lgstar
        ld1     {GSTAR.2d}, [TMP]
        ld1     {v0.16b}, [x0]
        ld1     {v1.16b}, [x1]
        karatsuba1_store v0 v1
        karatsuba2
        montgomery_reduction SUM
        st1     {SUM.16b}, [x0]
        ret
SYM_FUNC_END(pmull_polyval_mul)
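/*
 * Why this preserves Montgomery form: if op1 = a(x)*x^128 mod g(x) and
 * op2 = b(x)*x^128 mod g(x), their polynomial product carries a factor of
 * x^256, and multiplying by x^{-128} leaves a(x)*b(x)*x^128 mod g(x), the
 * Montgomery form of a(x)*b(x). A hedged usage sketch (hypothetical caller,
 * not kernel code) for precomputing successive key powers:
 *
 *	u8 power[16];
 *	memcpy(power, h, 16);		// power = h (Montgomery form)
 *	pmull_polyval_mul(power, h);	// power = h^2 (Montgomery form)
 */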
/*
 * Perform polynomial evaluation as specified by POLYVAL. This computes:
 *      h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
 * where n=nblocks, h is the hash key, and m_i are the message blocks.
 *
 * x0 - pointer to precomputed key powers h^8 ... h^1
 * x1 - pointer to message blocks
 * x2 - number of blocks to hash
 * x3 - pointer to accumulator
 *
 * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in,
 *                           size_t nblocks, u8 *accumulator);
 */
SYM_FUNC_START(pmull_polyval_update)
        adr     TMP, .Lgstar
        mov     KEY_START, KEY_POWERS
        ld1     {GSTAR.2d}, [TMP]
        ld1     {SUM.16b}, [ACCUMULATOR]
        subs    BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
        blt     .LstrideLoopExit
        ld1     {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
        ld1     {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
        full_stride 0
        subs    BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
        blt     .LstrideLoopExitReduce
.LstrideLoop:
        full_stride 1
        subs    BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
        bge     .LstrideLoop
.LstrideLoopExitReduce:
        montgomery_reduction SUM
.LstrideLoopExit:
        adds    BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
        beq     .LskipPartial
        partial_stride
.LskipPartial:
        st1     {SUM.16b}, [ACCUMULATOR]
        ret
SYM_FUNC_END(pmull_polyval_update)
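/*
 * A hedged usage sketch (hypothetical caller, not kernel code). Because the
 * accumulator is multiplied by h^n along with the message blocks, a long
 * message can be hashed incrementally:
 *
 *	u8 acc[16] = {};
 *	pmull_polyval_update(ctx, msg, 2, acc);
 *	// acc = h^2*m_0 + h^1*m_1
 *	pmull_polyval_update(ctx, msg + 32, 1, acc);
 *	// acc = h^1*acc + h^1*m_2 = h^3*m_0 + h^2*m_1 + h^1*m_2
 */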