/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Implementation of POLYVAL using ARMv8 Crypto Extensions.
 *
 * Copyright 2021 Google LLC
 */
/*
 * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions.
 * It works on 8 blocks at a time, by precomputing the first 8 key powers h^8,
 * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
 * finite field multiplication into two steps.
 *
 * In the first step, we consider h^i, m_i as normal polynomials of degree less
 * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
 * is simply polynomial multiplication.
 *
 * In the second step, we compute the reduction of p(x) modulo the finite field
 * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
 *
 * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
 * multiplication is finite field multiplication. The advantage is that the
 * two-step process only requires 1 finite field reduction for every 8
 * polynomial multiplications. Further parallelism is gained by interleaving the
 * multiplications and polynomial reductions.
 */
#include <linux/linkage.h>
#define STRIDE_BLOCKS 8

// Argument registers, matching the prototypes of pmull_polyval_mul() and
// pmull_polyval_update() defined later in this file.
KEY_POWERS	.req	x0
MSG		.req	x1
BLOCKS_LEFT	.req	x2
ACCUMULATOR	.req	x3
// Scratch general-purpose registers.
KEY_START	.req	x10	// saved copy of the key-powers base pointer
EXTRA_BYTES	.req	x11	// NOTE(review): appears unused in this file -- confirm
TMP	.req	x13

// Message blocks m_0 ... m_7 of the current 8-block stride.
M0	.req	v0
M1	.req	v1
M2	.req	v2
M3	.req	v3
M4	.req	v4
M5	.req	v5
M6	.req	v6
M7	.req	v7
// Precomputed key powers h^8 ... h^1 (kept resident across strides).
KEY8	.req	v8
KEY7	.req	v9
KEY6	.req	v10
KEY5	.req	v11
KEY4	.req	v12
KEY3	.req	v13
KEY2	.req	v14
KEY1	.req	v15
// Low and high 128-bit halves of the unreduced 256-bit product.
PL	.req	v16
PH	.req	v17
TMP_V	.req	v18
// Karatsuba partial products: low, middle, high.
LO	.req	v20
MI	.req	v21
HI	.req	v22
// Running POLYVAL accumulator.
SUM	.req	v23
// g*(x): bits 64-127 of the field modulus g(x), see .Lgstar below.
GSTAR	.req	v24
62*9d2c0b48SNathan Huckleberry
	.text

	.arch	armv8-a+crypto
	.align	4

// g*(x) = bits 64-127 of g(x) = x^128 + x^127 + x^126 + x^121 + 1.
// The constant is replicated in both 64-bit lanes so that both pmull
// (low lanes) and pmull2 (high lanes) can use GSTAR directly.
.Lgstar:
	.quad	0xc200000000000000, 0xc200000000000000
70*9d2c0b48SNathan Huckleberry
/*
 * Computes the product of two 128-bit polynomials in X and Y and XORs the
 * components of the 256-bit product into LO, MI, HI.
 *
 * Given:
 *  X = [X_1 : X_0]
 *  Y = [Y_1 : Y_0]
 *
 * We compute:
 *  LO += X_0 * Y_0
 *  MI += (X_0 + X_1) * (Y_0 + Y_1)
 *  HI += X_1 * Y_1
 *
 * Later, the 256-bit result can be extracted as:
 *   [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
 * This step is done when computing the polynomial reduction for efficiency
 * reasons.
 *
 * Karatsuba multiplication is used instead of Schoolbook multiplication because
 * it was found to be slightly faster on ARM64 CPUs.
 *
 * Clobbers v25-v29.
 */
.macro karatsuba1 X Y
	X .req \X
	Y .req \Y
	// v25 = [X_0 : X_1] + [X_1 : X_0]; each lane holds X_0 + X_1.
	ext	v25.16b, X.16b, X.16b, #8
	ext	v26.16b, Y.16b, Y.16b, #8
	eor	v25.16b, v25.16b, X.16b
	eor	v26.16b, v26.16b, Y.16b
	// v28 = X_1 * Y_1, v29 = X_0 * Y_0,
	// v27 = (X_0 + X_1) * (Y_0 + Y_1).
	pmull2	v28.1q, X.2d, Y.2d
	pmull	v29.1q, X.1d, Y.1d
	pmull	v27.1q, v25.1d, v26.1d
	eor	HI.16b, HI.16b, v28.16b
	eor	LO.16b, LO.16b, v29.16b
	eor	MI.16b, MI.16b, v27.16b
	.unreq X
	.unreq Y
.endm
109*9d2c0b48SNathan Huckleberry
/*
 * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
 * them.  Used to start a fresh accumulation without pre-zeroing.
 *
 * Clobbers v25-v26.
 */
.macro karatsuba1_store X Y
	X .req \X
	Y .req \Y
	// Each lane of v25/v26 holds X_0 + X_1 / Y_0 + Y_1 respectively.
	ext	v25.16b, X.16b, X.16b, #8
	ext	v26.16b, Y.16b, Y.16b, #8
	eor	v25.16b, v25.16b, X.16b
	eor	v26.16b, v26.16b, Y.16b
	pmull2	HI.1q, X.2d, Y.2d	// HI = X_1 * Y_1
	pmull	LO.1q, X.1d, Y.1d	// LO = X_0 * Y_0
	pmull	MI.1q, v25.1d, v26.1d	// MI = (X_0 + X_1) * (Y_0 + Y_1)
	.unreq X
	.unreq Y
.endm
127*9d2c0b48SNathan Huckleberry
/*
 * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
 * the result in PL, PH.
 * [PH : PL] =
 *   [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
 *
 * Clobbers v4 and v5 (aliases M4/M5), and rotates HI and LO in place.
 */
.macro karatsuba2
	// v4 = [HI_1 + MI_1 : HI_0 + MI_0]
	eor	v4.16b, HI.16b, MI.16b
	// v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
	eor	v4.16b, v4.16b, LO.16b
	// v5 = [HI_0 : LO_1]
	ext	v5.16b, LO.16b, HI.16b, #8
	// v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
	eor	v4.16b, v4.16b, v5.16b
	// HI = [HI_0 : HI_1]
	ext	HI.16b, HI.16b, HI.16b, #8
	// LO = [LO_0 : LO_1]
	ext	LO.16b, LO.16b, LO.16b, #8
	// PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
	ext	PH.16b, v4.16b, HI.16b, #8
	// PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
	ext	PL.16b, LO.16b, v4.16b, #8
.endm
152*9d2c0b48SNathan Huckleberry
/*
 * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
 *
 * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
 * x^128 + x^127 + x^126 + x^121 + 1.
 *
 * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
 * product of two 128-bit polynomials in Montgomery form.  We need to reduce it
 * mod g(x).  Also, since polynomials in Montgomery form have an "extra" factor
 * of x^128, this product has two extra factors of x^128.  To get it back into
 * Montgomery form, we need to remove one of these factors by dividing by x^128.
 *
 * To accomplish both of these goals, we add multiples of g(x) that cancel out
 * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
 * bits are zero, the polynomial division by x^128 can be done by right
 * shifting.
 *
 * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
 * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x).  The CPU can
 * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
 * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).  Adding this to
 * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
 * = T_1 : T_0 = g*(x) * P_0.  Thus, bits 0-63 got "folded" into bits 64-191.
 *
 * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
 * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
 * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
 * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
 * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
 *
 * So our final computation is:
 *   T = T_1 : T_0 = g*(x) * P_0
 *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
 *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
 *
 * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
 * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
 * T_1 into dest.  This allows us to reuse P_1 + T_0 when computing V.
 *
 * Clobbers TMP_V and PH.
 */
.macro montgomery_reduction dest
	DEST .req \dest
	// TMP_V = T_1 : T_0 = P_0 * g*(x)
	pmull	TMP_V.1q, PL.1d, GSTAR.1d
	// TMP_V = T_0 : T_1
	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
	// TMP_V = P_1 + T_0 : P_0 + T_1
	eor	TMP_V.16b, PL.16b, TMP_V.16b
	// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
	eor	PH.16b, PH.16b, TMP_V.16b
	// TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
	// DEST = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
	eor	DEST.16b, PH.16b, TMP_V.16b
	.unreq DEST
.endm
207*9d2c0b48SNathan Huckleberry
/*
 * Compute Polyval on 8 blocks.
 *
 * If reduce is set, also computes the montgomery reduction of the
 * previous full_stride call and XORs with the first message block.
 * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
 * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
 *
 * The six instructions of montgomery_reduction are interleaved one-per-slot
 * between the karatsuba1 invocations below, so the reduction of the previous
 * stride's product overlaps the current stride's multiplications.
 *
 * Sets PL, PH.
 */
.macro full_stride reduce
	// Zero the Karatsuba accumulators for this stride.
	eor		LO.16b, LO.16b, LO.16b
	eor		MI.16b, MI.16b, MI.16b
	eor		HI.16b, HI.16b, HI.16b

	// Load the next 8 message blocks m_0 ... m_7.
	ld1		{M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
	ld1		{M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64

	karatsuba1 M7 KEY1
	.if \reduce
	// Reduction step 1: TMP_V = T = P_0 * g*(x)
	pmull	TMP_V.1q, PL.1d, GSTAR.1d
	.endif

	karatsuba1 M6 KEY2
	.if \reduce
	// Reduction step 2: TMP_V = T_0 : T_1
	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
	.endif

	karatsuba1 M5 KEY3
	.if \reduce
	// Reduction step 3: TMP_V = P_1 + T_0 : P_0 + T_1
	eor	TMP_V.16b, PL.16b, TMP_V.16b
	.endif

	karatsuba1 M4 KEY4
	.if \reduce
	// Reduction step 4: fold into the high half of the product.
	eor	PH.16b, PH.16b, TMP_V.16b
	.endif

	karatsuba1 M3 KEY5
	.if \reduce
	// Reduction step 5: TMP_V = V = (P_1 + T_0) * g*(x)
	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
	.endif

	karatsuba1 M2 KEY6
	.if \reduce
	// Reduction step 6: SUM = REDUCE(PL, PH) of the previous stride.
	eor	SUM.16b, PH.16b, TMP_V.16b
	.endif

	karatsuba1 M1 KEY7
	// Fold the accumulator into the first message block before its multiply.
	eor	M0.16b, M0.16b, SUM.16b

	karatsuba1 M0 KEY8
	karatsuba2
.endm
262*9d2c0b48SNathan Huckleberry
/*
 * Handle any extra blocks after full_stride loop.
 *
 * On entry BLOCKS_LEFT is between 1 and STRIDE_BLOCKS - 1, SUM holds the
 * current accumulator, and KEY_START points at the key-power table.  The
 * remaining blocks are multiplied by the matching tail of the key powers
 * (h^BLOCKS_LEFT ... h^1), then the product is reduced into SUM.
 */
.macro partial_stride
	// Point KEY_POWERS at h^BLOCKS_LEFT; each key power is 16 bytes.
	add	KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
	sub	KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
	ld1	{KEY1.16b}, [KEY_POWERS], #16

	// First block: multiply (m_0 + SUM) by the highest needed key power,
	// initializing LO/MI/HI via the _store variant.
	ld1	{TMP_V.16b}, [MSG], #16
	eor	SUM.16b, SUM.16b, TMP_V.16b
	karatsuba1_store KEY1 SUM
	sub	BLOCKS_LEFT, BLOCKS_LEFT, #1

	// Consume the remaining 0-6 blocks in groups of 4, 2, and 1, keyed by
	// the bits of BLOCKS_LEFT.
	tst	BLOCKS_LEFT, #4
	beq	.Lpartial4BlocksDone
	ld1	{M0.16b, M1.16b,  M2.16b, M3.16b}, [MSG], #64
	ld1	{KEY8.16b, KEY7.16b, KEY6.16b,	KEY5.16b}, [KEY_POWERS], #64
	karatsuba1 M0 KEY8
	karatsuba1 M1 KEY7
	karatsuba1 M2 KEY6
	karatsuba1 M3 KEY5
.Lpartial4BlocksDone:
	tst	BLOCKS_LEFT, #2
	beq	.Lpartial2BlocksDone
	ld1	{M0.16b, M1.16b}, [MSG], #32
	ld1	{KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
	karatsuba1 M0 KEY8
	karatsuba1 M1 KEY7
.Lpartial2BlocksDone:
	tst	BLOCKS_LEFT, #1
	beq	.LpartialDone
	ld1	{M0.16b}, [MSG], #16
	ld1	{KEY8.16b}, [KEY_POWERS], #16
	karatsuba1 M0 KEY8
.LpartialDone:
	// Recombine the partial products and reduce into the accumulator.
	karatsuba2
	montgomery_reduction SUM
.endm
301*9d2c0b48SNathan Huckleberry
/*
 * Perform montgomery multiplication in GF(2^128) and store result in op1.
 *
 * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
 * If op1, op2 are in montgomery form, this computes the montgomery
 * form of op1*op2.
 *
 * void pmull_polyval_mul(u8 *op1, const u8 *op2);
 */
SYM_FUNC_START(pmull_polyval_mul)
	// Load the reduction constant g*(x).
	adr	TMP, .Lgstar
	ld1	{GSTAR.2d}, [TMP]
	// Load the two 16-byte operands.
	ld1	{v0.16b}, [x0]
	ld1	{v1.16b}, [x1]
	// 256-bit Karatsuba product -> PL/PH -> reduced 128-bit result.
	karatsuba1_store v0 v1
	karatsuba2
	montgomery_reduction SUM
	st1	{SUM.16b}, [x0]
	ret
SYM_FUNC_END(pmull_polyval_mul)
322*9d2c0b48SNathan Huckleberry
/*
 * Perform polynomial evaluation as specified by POLYVAL.  This computes:
 *	h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
 * where n=nblocks, h is the hash key, and m_i are the message blocks.
 *
 * x0 - pointer to precomputed key powers h^8 ... h^1
 * x1 - pointer to message blocks
 * x2 - number of blocks to hash
 * x3 - pointer to accumulator
 *
 * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in,
 *			     size_t nblocks, u8 *accumulator);
 */
SYM_FUNC_START(pmull_polyval_update)
	adr	TMP, .Lgstar
	// Save the key-powers base for partial_stride's pointer arithmetic.
	mov	KEY_START, KEY_POWERS
	ld1	{GSTAR.2d}, [TMP]
	ld1	{SUM.16b}, [ACCUMULATOR]
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	blt .LstrideLoopExit
	// Load all 8 key powers once; they remain resident in v8-v15.
	ld1	{KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
	ld1	{KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
	// First stride has no previous product to reduce.
	full_stride 0
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	blt .LstrideLoopExitReduce
.LstrideLoop:
	// Each iteration reduces the previous stride's product while
	// multiplying the current 8 blocks.
	full_stride 1
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	bge	.LstrideLoop
.LstrideLoopExitReduce:
	// Reduce the last full stride's product into the accumulator.
	montgomery_reduction SUM
.LstrideLoopExit:
	// Undo the final subtraction to recover the 0..7 leftover blocks.
	adds	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	beq	.LskipPartial
	partial_stride
.LskipPartial:
	st1	{SUM.16b}, [ACCUMULATOR]
	ret
SYM_FUNC_END(pmull_polyval_update)
362