1/* SPDX-License-Identifier: GPL-2.0 */ 2/* 3 * Copyright 2021 Google LLC 4 */ 5/* 6 * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI 7 * instructions. It works on 8 blocks at a time, by precomputing the first 8 8 * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation 9 * allows us to split finite field multiplication into two steps. 10 * 11 * In the first step, we consider h^i, m_i as normal polynomials of degree less 12 * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication 13 * is simply polynomial multiplication. 14 * 15 * In the second step, we compute the reduction of p(x) modulo the finite field 16 * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1. 17 * 18 * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where 19 * multiplication is finite field multiplication. The advantage is that the 20 * two-step process only requires 1 finite field reduction for every 8 21 * polynomial multiplications. Further parallelism is gained by interleaving the 22 * multiplications and polynomial reductions. 23 */ 24 25#include <linux/linkage.h> 26#include <asm/frame.h> 27 28#define STRIDE_BLOCKS 8 29 30#define GSTAR %xmm7 31#define PL %xmm8 32#define PH %xmm9 33#define TMP_XMM %xmm11 34#define LO %xmm12 35#define HI %xmm13 36#define MI %xmm14 37#define SUM %xmm15 38 39#define KEY_POWERS %rdi 40#define MSG %rsi 41#define BLOCKS_LEFT %rdx 42#define ACCUMULATOR %rcx 43#define TMP %rax 44 45.section .rodata.cst16.gstar, "aM", @progbits, 16 46.align 16 47 48.Lgstar: 49 .quad 0xc200000000000000, 0xc200000000000000 50 51.text 52 53/* 54 * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length 55 * count pointed to by MSG and KEY_POWERS. 56 */ 57.macro schoolbook1 count 58 .set i, 0 59 .rept (\count) 60 schoolbook1_iteration i 0 61 .set i, (i +1) 62 .endr 63.endm 64 65/* 66 * Computes the product of two 128-bit polynomials at the memory locations 67 * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of 68 * the 256-bit product into LO, MI, HI. 69 * 70 * Given: 71 * X = [X_1 : X_0] 72 * Y = [Y_1 : Y_0] 73 * 74 * We compute: 75 * LO += X_0 * Y_0 76 * MI += X_0 * Y_1 + X_1 * Y_0 77 * HI += X_1 * Y_1 78 * 79 * Later, the 256-bit result can be extracted as: 80 * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] 81 * This step is done when computing the polynomial reduction for efficiency 82 * reasons. 83 * 84 * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an 85 * extra multiplication of SUM and h^8. 86 */ 87.macro schoolbook1_iteration i xor_sum 88 movups (16*\i)(MSG), %xmm0 89 .if (\i == 0 && \xor_sum == 1) 90 pxor SUM, %xmm0 91 .endif 92 vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2 93 vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1 94 vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3 95 vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4 96 vpxor %xmm2, MI, MI 97 vpxor %xmm1, LO, LO 98 vpxor %xmm4, HI, HI 99 vpxor %xmm3, MI, MI 100.endm 101 102/* 103 * Performs the same computation as schoolbook1_iteration, except we expect the 104 * arguments to already be loaded into xmm0 and xmm1 and we set the result 105 * registers LO, MI, and HI directly rather than XOR'ing into them. 106 */ 107.macro schoolbook1_noload 108 vpclmulqdq $0x01, %xmm0, %xmm1, MI 109 vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2 110 vpclmulqdq $0x00, %xmm0, %xmm1, LO 111 vpclmulqdq $0x11, %xmm0, %xmm1, HI 112 vpxor %xmm2, MI, MI 113.endm 114 115/* 116 * Computes the 256-bit polynomial represented by LO, HI, MI. Stores 117 * the result in PL, PH. 118 * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] 119 */ 120.macro schoolbook2 121 vpslldq $8, MI, PL 122 vpsrldq $8, MI, PH 123 pxor LO, PL 124 pxor HI, PH 125.endm 126 127/* 128 * Computes the 128-bit reduction of PH : PL. Stores the result in dest. 129 * 130 * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) = 131 * x^128 + x^127 + x^126 + x^121 + 1. 132 * 133 * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the 134 * product of two 128-bit polynomials in Montgomery form. We need to reduce it 135 * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor 136 * of x^128, this product has two extra factors of x^128. To get it back into 137 * Montgomery form, we need to remove one of these factors by dividing by x^128. 138 * 139 * To accomplish both of these goals, we add multiples of g(x) that cancel out 140 * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low 141 * bits are zero, the polynomial division by x^128 can be done by right shifting. 142 * 143 * Since the only nonzero term in the low 64 bits of g(x) is the constant term, 144 * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can 145 * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + 146 * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to 147 * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T 148 * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. 149 * 150 * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits 151 * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1 152 * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) * 153 * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 : 154 * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0). 155 * 156 * So our final computation is: 157 * T = T_1 : T_0 = g*(x) * P_0 158 * V = V_1 : V_0 = g*(x) * (P_1 + T_0) 159 * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 160 * 161 * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0 162 * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 : 163 * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V. 164 */ 165.macro montgomery_reduction dest 166 vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x) 167 pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1 168 pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1 169 pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1 170 pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)] 171 vpxor TMP_XMM, PH, \dest 172.endm 173 174/* 175 * Compute schoolbook multiplication for 8 blocks 176 * m_0h^8 + ... + m_7h^1 177 * 178 * If reduce is set, also computes the montgomery reduction of the 179 * previous full_stride call and XORs with the first message block. 180 * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. 181 * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. 182 */ 183.macro full_stride reduce 184 pxor LO, LO 185 pxor HI, HI 186 pxor MI, MI 187 188 schoolbook1_iteration 7 0 189 .if \reduce 190 vpclmulqdq $0x00, PL, GSTAR, TMP_XMM 191 .endif 192 193 schoolbook1_iteration 6 0 194 .if \reduce 195 pshufd $0b01001110, TMP_XMM, TMP_XMM 196 .endif 197 198 schoolbook1_iteration 5 0 199 .if \reduce 200 pxor PL, TMP_XMM 201 .endif 202 203 schoolbook1_iteration 4 0 204 .if \reduce 205 pxor TMP_XMM, PH 206 .endif 207 208 schoolbook1_iteration 3 0 209 .if \reduce 210 pclmulqdq $0x11, GSTAR, TMP_XMM 211 .endif 212 213 schoolbook1_iteration 2 0 214 .if \reduce 215 vpxor TMP_XMM, PH, SUM 216 .endif 217 218 schoolbook1_iteration 1 0 219 220 schoolbook1_iteration 0 1 221 222 addq $(8*16), MSG 223 schoolbook2 224.endm 225 226/* 227 * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS 228 */ 229.macro partial_stride 230 mov BLOCKS_LEFT, TMP 231 shlq $4, TMP 232 addq $(16*STRIDE_BLOCKS), KEY_POWERS 233 subq TMP, KEY_POWERS 234 235 movups (MSG), %xmm0 236 pxor SUM, %xmm0 237 movaps (KEY_POWERS), %xmm1 238 schoolbook1_noload 239 dec BLOCKS_LEFT 240 addq $16, MSG 241 addq $16, KEY_POWERS 242 243 test $4, BLOCKS_LEFT 244 jz .Lpartial4BlocksDone 245 schoolbook1 4 246 addq $(4*16), MSG 247 addq $(4*16), KEY_POWERS 248.Lpartial4BlocksDone: 249 test $2, BLOCKS_LEFT 250 jz .Lpartial2BlocksDone 251 schoolbook1 2 252 addq $(2*16), MSG 253 addq $(2*16), KEY_POWERS 254.Lpartial2BlocksDone: 255 test $1, BLOCKS_LEFT 256 jz .LpartialDone 257 schoolbook1 1 258.LpartialDone: 259 schoolbook2 260 montgomery_reduction SUM 261.endm 262 263/* 264 * Perform montgomery multiplication in GF(2^128) and store result in op1. 265 * 266 * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 267 * If op1, op2 are in montgomery form, this computes the montgomery 268 * form of op1*op2. 269 * 270 * void clmul_polyval_mul(u8 *op1, const u8 *op2); 271 */ 272SYM_FUNC_START(clmul_polyval_mul) 273 FRAME_BEGIN 274 vmovdqa .Lgstar(%rip), GSTAR 275 movups (%rdi), %xmm0 276 movups (%rsi), %xmm1 277 schoolbook1_noload 278 schoolbook2 279 montgomery_reduction SUM 280 movups SUM, (%rdi) 281 FRAME_END 282 RET 283SYM_FUNC_END(clmul_polyval_mul) 284 285/* 286 * Perform polynomial evaluation as specified by POLYVAL. This computes: 287 * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} 288 * where n=nblocks, h is the hash key, and m_i are the message blocks. 289 * 290 * rdi - pointer to precomputed key powers h^8 ... h^1 291 * rsi - pointer to message blocks 292 * rdx - number of blocks to hash 293 * rcx - pointer to the accumulator 294 * 295 * void clmul_polyval_update(const struct polyval_tfm_ctx *keys, 296 * const u8 *in, size_t nblocks, u8 *accumulator); 297 */ 298SYM_FUNC_START(clmul_polyval_update) 299 FRAME_BEGIN 300 vmovdqa .Lgstar(%rip), GSTAR 301 movups (ACCUMULATOR), SUM 302 subq $STRIDE_BLOCKS, BLOCKS_LEFT 303 js .LstrideLoopExit 304 full_stride 0 305 subq $STRIDE_BLOCKS, BLOCKS_LEFT 306 js .LstrideLoopExitReduce 307.LstrideLoop: 308 full_stride 1 309 subq $STRIDE_BLOCKS, BLOCKS_LEFT 310 jns .LstrideLoop 311.LstrideLoopExitReduce: 312 montgomery_reduction SUM 313.LstrideLoopExit: 314 add $STRIDE_BLOCKS, BLOCKS_LEFT 315 jz .LskipPartial 316 partial_stride 317.LskipPartial: 318 movups SUM, (ACCUMULATOR) 319 FRAME_END 320 RET 321SYM_FUNC_END(clmul_polyval_update) 322