/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in rfc8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

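/*
 * The SM4E instruction (one instruction per four rounds, so eight per
 * block with the round keys preloaded in v24-v31) is emitted by hand
 * with .inst so the file also assembles without SM4 support in the
 * toolchain: the .irp/.set block above maps each vN.4s operand name to
 * its register number, and the sm4e macro ORs Vn into bits [9:5] and Vd
 * into bits [4:0] of the base opcode 0xcec08400.
 */
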
/* Register macros */

/* Used for both encryption and decryption */
#define	RHASH	v21
#define	RRCONST	v22
#define RZERO	v23

/* Helper macros. */

/*
 * input: m0, m1
 * output: r0:r1 (low 128-bits in r0, high in r1)
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		eor		T0.16b, T0.16b, T1.16b;		\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r1.16b, r1.16b, T0.16b;

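/*
 * For reference, a minimal C sketch (illustrative only, not part of the
 * build) of what PMUL_128x128 computes: a 128x128 -> 256 bit carry-less
 * multiplication split into four 64x64 PMULL/PMULL2 products, with the
 * two cross products combined in T0/T1 exactly as the macro does.
 *
 *	#include <stdint.h>
 *
 *	struct u128 { uint64_t lo, hi; };	// lo = bits 0..63
 *	struct u256 { uint64_t w[4]; };		// w[0] = bits 0..63
 *
 *	// 64x64 -> 128 carry-less multiply: the job of one PMULL/PMULL2.
 *	static struct u128 clmul64(uint64_t a, uint64_t b)
 *	{
 *		struct u128 r = { 0, 0 };
 *		int i;
 *
 *		for (i = 0; i < 64; i++) {
 *			if ((b >> i) & 1) {
 *				r.lo ^= a << i;
 *				r.hi ^= i ? a >> (64 - i) : 0;
 *			}
 *		}
 *		return r;
 *	}
 *
 *	// 128x128 -> 256, schoolbook: low, high and two cross products.
 *	static struct u256 clmul128(struct u128 a, struct u128 b)
 *	{
 *		struct u128 lo = clmul64(a.lo, b.lo);	// pmull  r0
 *		struct u128 hi = clmul64(a.hi, b.hi);	// pmull2 r1
 *		struct u128 m0 = clmul64(a.lo, b.hi);	// pmull  T1
 *		struct u128 m1 = clmul64(a.hi, b.lo);	// pmull2 T0
 *		struct u256 r;
 *
 *		r.w[0] = lo.lo;
 *		r.w[1] = lo.hi ^ m0.lo ^ m1.lo;		// middle term, low half
 *		r.w[2] = hi.lo ^ m0.hi ^ m1.hi;		// middle term, high half
 *		r.w[3] = hi.hi;
 *		return r;
 *	}
 */
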
#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,			\
			r2, r3, m2, m3, T2, T3,			\
			r4, r5, m4, m5, T4, T5,			\
			r6, r7, m6, m7, T6, T7)			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		ext		T2.16b, m3.16b, m3.16b, #8;	\
		ext		T4.16b, m5.16b, m5.16b, #8;	\
		ext		T6.16b, m7.16b, m7.16b, #8;	\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		r2.1q, m2.1d, m3.1d;		\
		pmull		r4.1q, m4.1d, m5.1d;		\
		pmull		r6.1q, m6.1d, m7.1d;		\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull		T3.1q, m2.1d, T2.1d;		\
		pmull		T5.1q, m4.1d, T4.1d;		\
		pmull		T7.1q, m6.1d, T6.1d;		\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		T2.1q, m2.2d, T2.2d;		\
		pmull2		T4.1q, m4.2d, T4.2d;		\
		pmull2		T6.1q, m6.2d, T6.2d;		\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		pmull2		r3.1q, m2.2d, m3.2d;		\
		pmull2		r5.1q, m4.2d, m5.2d;		\
		pmull2		r7.1q, m6.2d, m7.2d;		\
		eor		T0.16b, T0.16b, T1.16b;		\
		eor		T2.16b, T2.16b, T3.16b;		\
		eor		T4.16b, T4.16b, T5.16b;		\
		eor		T6.16b, T6.16b, T7.16b;		\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T3.16b, RZERO.16b, T2.16b, #8;	\
		ext		T5.16b, RZERO.16b, T4.16b, #8;	\
		ext		T7.16b, RZERO.16b, T6.16b, #8;	\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		ext		T2.16b, T2.16b, RZERO.16b, #8;	\
		ext		T4.16b, T4.16b, RZERO.16b, #8;	\
		ext		T6.16b, T6.16b, RZERO.16b, #8;	\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r2.16b, r2.16b, T3.16b;		\
		eor		r4.16b, r4.16b, T5.16b;		\
		eor		r6.16b, r6.16b, T7.16b;		\
		eor		r1.16b, r1.16b, T0.16b;		\
		eor		r3.16b, r3.16b, T2.16b;		\
		eor		r5.16b, r5.16b, T4.16b;		\
		eor		r7.16b, r7.16b, T6.16b;

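/*
 * PMUL_128x128_4x is the same schoolbook multiplication unrolled for
 * four independent (m, H^i) pairs, with the four copies of each step
 * grouped together so no PMULL result is consumed by the immediately
 * following instruction; this helps hide multiplier latency when
 * ghash-ing four blocks per reduction.
 */
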
/*
 * input: r0:r1 (low 128-bits in r0, high in r1)
 * output: a
 */
#define REDUCTION(a, r0, r1, rconst, T0, T1)			\
		pmull2		T0.1q, r1.2d, rconst.2d;	\
		ext		T1.16b, T0.16b, RZERO.16b, #8;	\
		ext		T0.16b, RZERO.16b, T0.16b, #8;	\
		eor		r1.16b, r1.16b, T1.16b;		\
		eor		r0.16b, r0.16b, T0.16b;		\
		pmull		T0.1q, r1.1d, rconst.1d;	\
		eor		a.16b, r0.16b, T0.16b;

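/*
 * Field arithmetic note: every GHASH operand is bit-reversed with rbit
 * before it reaches these macros, which turns GHASH's reflected bit
 * ordering into plain polynomial bit ordering.  REDUCTION then folds
 * the 256-bit product back into GF(2^128) modulo
 * P(x) = x^128 + x^7 + x^2 + x + 1, where 0x87 (.Lghash_rconst) is
 * x^7 + x^2 + x + 1, i.e. x^128 mod P(x).  A C sketch of the fold,
 * reusing clmul64()/struct u128/u256 from the sketch above
 * (illustrative only, not part of the build):
 *
 *	static struct u128 ghash_reduce(struct u256 p)
 *	{
 *		const uint64_t rconst = 0x87;	// x^128 mod P(x)
 *		struct u128 f, r;
 *
 *		f = clmul64(p.w[3], rconst);	// w3 * x^192 mod P(x) ...
 *		p.w[1] ^= f.lo;			// ... lands at bits 64..191
 *		p.w[2] ^= f.hi;
 *
 *		f = clmul64(p.w[2], rconst);	// w2 * x^128 mod P(x)
 *		r.lo = p.w[0] ^ f.lo;		// lands at bits 0..127
 *		r.hi = p.w[1] ^ f.hi;
 *		return r;
 *	}
 */
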
#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)	\
	rev32			b0.16b, b0.16b;			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
	sm4e			b0.4s, v24.4s;			\
		pmull		r0.1q, m0.1d, m1.1d;		\
	sm4e			b0.4s, v25.4s;			\
		pmull		T1.1q, m0.1d, T0.1d;		\
	sm4e			b0.4s, v26.4s;			\
		pmull2		T0.1q, m0.2d, T0.2d;		\
	sm4e			b0.4s, v27.4s;			\
		pmull2		r1.1q, m0.2d, m1.2d;		\
	sm4e			b0.4s, v28.4s;			\
		eor		T0.16b, T0.16b, T1.16b;		\
	sm4e			b0.4s, v29.4s;			\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
	sm4e			b0.4s, v30.4s;			\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
	sm4e			b0.4s, v31.4s;			\
		eor		r0.16b, r0.16b, T1.16b;		\
	rev64			b0.4s, b0.4s;			\
		eor		r1.16b, r1.16b, T0.16b;		\
	ext			b0.16b, b0.16b, b0.16b, #8;	\
	rev32			b0.16b, b0.16b;

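/*
 * SM4_CRYPT_PMUL_128x128_BLK (and the 3-block variant below) interleaves
 * one SM4 encryption with one 128x128 carry-less multiplication: the
 * deeper-indented pmull/eor/ext lines are the PMUL_128x128 steps,
 * slotted between the eight sm4e rounds so the two instruction streams
 * can execute in parallel.  The trailing rev64/ext/rev32 on b0 is the
 * usual SM4-CE output fixup (reverse the four state words and restore
 * byte order), matching SM4_CRYPT_BLK in sm4-ce-asm.h.
 */
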
#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,			\
				    r0, r1, m0, m1, T0, T1,	\
				    r2, r3, m2, m3, T2, T3,	\
				    r4, r5, m4, m5, T4, T5)	\
	rev32			b0.16b, b0.16b;			\
	rev32			b1.16b, b1.16b;			\
	rev32			b2.16b, b2.16b;			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		ext		T2.16b, m3.16b, m3.16b, #8;	\
		ext		T4.16b, m5.16b, m5.16b, #8;	\
	sm4e			b0.4s, v24.4s;			\
	sm4e			b1.4s, v24.4s;			\
	sm4e			b2.4s, v24.4s;			\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		r2.1q, m2.1d, m3.1d;		\
		pmull		r4.1q, m4.1d, m5.1d;		\
	sm4e			b0.4s, v25.4s;			\
	sm4e			b1.4s, v25.4s;			\
	sm4e			b2.4s, v25.4s;			\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull		T3.1q, m2.1d, T2.1d;		\
		pmull		T5.1q, m4.1d, T4.1d;		\
	sm4e			b0.4s, v26.4s;			\
	sm4e			b1.4s, v26.4s;			\
	sm4e			b2.4s, v26.4s;			\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		T2.1q, m2.2d, T2.2d;		\
		pmull2		T4.1q, m4.2d, T4.2d;		\
	sm4e			b0.4s, v27.4s;			\
	sm4e			b1.4s, v27.4s;			\
	sm4e			b2.4s, v27.4s;			\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		pmull2		r3.1q, m2.2d, m3.2d;		\
		pmull2		r5.1q, m4.2d, m5.2d;		\
	sm4e			b0.4s, v28.4s;			\
	sm4e			b1.4s, v28.4s;			\
	sm4e			b2.4s, v28.4s;			\
		eor		T0.16b, T0.16b, T1.16b;		\
		eor		T2.16b, T2.16b, T3.16b;		\
		eor		T4.16b, T4.16b, T5.16b;		\
	sm4e			b0.4s, v29.4s;			\
	sm4e			b1.4s, v29.4s;			\
	sm4e			b2.4s, v29.4s;			\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T3.16b, RZERO.16b, T2.16b, #8;	\
		ext		T5.16b, RZERO.16b, T4.16b, #8;	\
	sm4e			b0.4s, v30.4s;			\
	sm4e			b1.4s, v30.4s;			\
	sm4e			b2.4s, v30.4s;			\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		ext		T2.16b, T2.16b, RZERO.16b, #8;	\
		ext		T4.16b, T4.16b, RZERO.16b, #8;	\
	sm4e			b0.4s, v31.4s;			\
	sm4e			b1.4s, v31.4s;			\
	sm4e			b2.4s, v31.4s;			\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r2.16b, r2.16b, T3.16b;		\
		eor		r4.16b, r4.16b, T5.16b;		\
	rev64			b0.4s, b0.4s;			\
	rev64			b1.4s, b1.4s;			\
	rev64			b2.4s, b2.4s;			\
		eor		r1.16b, r1.16b, T0.16b;		\
		eor		r3.16b, r3.16b, T2.16b;		\
		eor		r5.16b, r5.16b, T4.16b;		\
	ext			b0.16b, b0.16b, b0.16b, #8;	\
	ext			b1.16b, b1.16b, b1.16b, #8;	\
	ext			b2.16b, b2.16b, b2.16b, #8;	\
		eor		r0.16b, r0.16b, r2.16b;		\
		eor		r1.16b, r1.16b, r3.16b;		\
	rev32			b0.16b, b0.16b;			\
	rev32			b1.16b, b1.16b;			\
	rev32			b2.16b, b2.16b;			\
		eor		r0.16b, r0.16b, r4.16b;		\
		eor		r1.16b, r1.16b, r5.16b;

#define inc32_le128(vctr)					\
		mov		vctr.d[1], x9;			\
		add		w6, w9, #1;			\
		mov		vctr.d[0], x8;			\
		bfi		x9, x6, #0, #32;		\
		rev64		vctr.16b, vctr.16b;

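/*
 * inc32_le128: x8/x9 hold the 128-bit counter block in CPU word order,
 * as loaded and byte-swapped by the callers below.  Each invocation
 * emits the current counter value as a big-endian block in vctr (the
 * rev64 restores memory byte order) and then increments only the low
 * 32 bits kept in x9, i.e. GCM's inc32() behaviour, ready for the next
 * block.
 */
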
#define GTAG_HASH_LENGTHS(vctr0, vlen)					\
		ld1		{vlen.16b}, [x7];			\
		/* construct CTR0 */					\
		/* the low 32 bits of the initial counter block are always be32(1) */	\
		mov		x6, #0x1;				\
		bfi		x9, x6, #0, #32;			\
		mov		vctr0.d[0], x8;				\
		mov		vctr0.d[1], x9;				\
		rbit		vlen.16b, vlen.16b;			\
		rev64		vctr0.16b, vctr0.16b;			\
		/* authtag = GCTR(CTR0, GHASH) */			\
		eor		RHASH.16b, RHASH.16b, vlen.16b;		\
		SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1,	\
					   RTMP0, RTMP1);		\
		REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3);	\
		rbit		RHASH.16b, RHASH.16b;			\
		eor		RHASH.16b, RHASH.16b, vctr0.16b;

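/*
 * GTAG_HASH_LENGTHS performs the last two steps of GCM: fold the
 * lengths block len(A) || len(C) (pointed to by x7) into the GHASH
 * state, then compute the tag as GHASH ^ E(K, CTR0), where CTR0 is the
 * initial counter block rebuilt from x8/x9 with its low 32 bits set to
 * be32(1).  The SM4 encryption of CTR0 and the final GHASH
 * multiplication are interleaved via SM4_CRYPT_PMUL_128x128_BLK, and
 * the result is left in RHASH for the caller to store.
 */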

/* Register macros for encrypt and ghash */

/* can be the same as input v0-v3 */
#define	RR1	v0
#define	RR3	v1
#define	RR5	v2
#define	RR7	v3

#define	RR0	v4
#define	RR2	v5
#define	RR4	v6
#define	RR6	v7

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11
#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define	RH1	v16
#define	RH2	v17
#define	RH3	v18
#define	RH4	v19

.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: ghash table
	 */
	SM4_PREPARE(x0)

	adr_l		x2, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x2]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	/* H = E(K, 0^128) */
	rev32		v0.16b, RZERO.16b
	SM4_CRYPT_BLK_BE(v0)

	/* H ^ 1 */
	rbit		RH1.16b, v0.16b

	/* H ^ 2 */
	PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
	REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 3 */
	PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
	REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 4 */
	PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
	REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{RH1.16b-RH4.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)

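/*
 * The ghash table written above is simply the four precomputed hash-key
 * powers H^1..H^4 (64 bytes, already bit-reversed), which lets the bulk
 * loops below multiply four blocks per reduction.
 */
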
.align 3
SYM_FUNC_START(pmull_ghash_update)
	/* input:
	 *   x0: ghash table
	 *   x1: ghash result
	 *   x2: src
	 *   w3: nblocks
	 */
	ld1		{RH1.16b-RH4.16b}, [x0]

	ld1		{RHASH.16b}, [x1]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x4, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x4]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

.Lghash_loop_4x:
	cmp		w3, #4
	blt		.Lghash_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w3, .Lghash_end
	b		.Lghash_loop_4x

.Lghash_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbnz		w3, .Lghash_loop_1x

.Lghash_end:
	rbit		RHASH.16b, RHASH.16b
	st1		{RHASH.2d}, [x1]

	ret
SYM_FUNC_END(pmull_ghash_update)

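/*
 * Illustrative C-side view of the two GHASH helpers above (assumed
 * prototypes, for reference only; the authoritative declarations live
 * in the C glue code).  They simply mirror the register comments:
 *
 *	asmlinkage void sm4_ce_pmull_ghash_setup(const u32 *rkey_enc,
 *						 u8 *ghash_table);
 *	asmlinkage void pmull_ghash_update(const u8 *ghash_table, u8 *ghash,
 *					   const u8 *src, unsigned int nblocks);
 */
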
.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH4.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_enc_hash_len

.Lgcm_enc_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lgcm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	inc32_le128(v1)			/* +1 */
	inc32_le128(v2)			/* +2 */
	inc32_le128(v3)			/* +3 */

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	/* ghash update */

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc32_le128(v0)

	ld1		{RTMP0.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, RTMP0.16b
	st1		{v0.16b}, [x1], #16

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_1x

.Lgcm_enc_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

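	/*
	 * v3 now holds .Lcts_permute_table + (32 - nbytes): a tbl index
	 * vector that will move the ciphertext bytes accumulated in the
	 * top of v0 by the loop below down to the bottom of the block and
	 * zero-fill the remainder (out-of-range 0xff indices select zero),
	 * yielding the zero-padded partial block that is ghash-ed.  The
	 * decryption tail uses the same trick.
	 */
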
.Lgcm_enc_tail_loop:
	/* do encrypt */
	ldrb		w0, [x2], #1	/* get 1 byte from input */
	umov		w6, v0.b[0]	/* get next keystream byte */
	eor		w6, w6, w0	/* w6 = CTR ^ input */
	strb		w6, [x1], #1	/* store output byte */

	/* shift right out one byte */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* the last ciphertext is placed in high bytes */
	ins		v0.b[15], w6

	subs		w4, w4, #1
	bne		.Lgcm_enc_tail_loop

	/* padding last block with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
	cbz		x7, .Lgcm_enc_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_enc_ret

.Lgcm_enc_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)

#undef	RR1
#undef	RR3
#undef	RR5
#undef	RR7
#undef	RR0
#undef	RR2
#undef	RR4
#undef	RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef	RH1
#undef	RH2
#undef	RH3
#undef	RH4


/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */

#define	RR1	v6
#define	RR3	v7
#define	RR5	v8

#define	RR0	v9
#define	RR2	v10
#define	RR4	v11

#define RTMP0	v12
#define RTMP1	v13
#define RTMP2	v14
#define RTMP3	v15
#define RTMP4	v16
#define RTMP5	v17

#define	RH1	v18
#define	RH2	v19
#define	RH3	v20

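/*
 * Decryption processes three blocks per iteration rather than four: the
 * ciphertext is both the GHASH input and needed again (kept in v3-v5)
 * for the final XOR with the keystream, so fewer vector registers are
 * free; in exchange, since the ciphertext is known up front, the GHASH
 * multiplications can be interleaved with the SM4 rounds via
 * SM4_CRYPT_PMUL_128x128_BLK3.
 */
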
.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH3.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_dec_hash_len

.Lgcm_dec_loop_3x:
	cmp		w4, #(3 * 16)
	blt		.Lgcm_dec_loop_1x

	sub		w4, w4, #(3 * 16)

	ld1		{v3.16b-v5.16b}, [x2], #(3 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	rbit		v6.16b, v3.16b
	inc32_le128(v1)			/* +1 */
	rbit		v7.16b, v4.16b
	inc32_le128(v2)			/* +2 */
	rbit		v8.16b, v5.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	/* decrypt & ghash update */
	SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
				    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
				    RR2, RR3, v7, RH2, RTMP2, RTMP3,
				    RR4, RR5, v8, RH1, RTMP4, RTMP5)

	eor		v0.16b, v0.16b, v3.16b
	eor		v1.16b, v1.16b, v4.16b
	eor		v2.16b, v2.16b, v5.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	st1		{v0.16b-v2.16b}, [x1], #(3 * 16)

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_3x

.Lgcm_dec_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_dec_tail

	sub		w4, w4, #16

	ld1		{v3.16b}, [x2], #16

	/* construct CTRs */
	inc32_le128(v0)
	rbit		v6.16b, v3.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

	eor		v0.16b, v0.16b, v3.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{v0.16b}, [x1], #16

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_1x

.Lgcm_dec_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

.Lgcm_dec_tail_loop:
	/* do decrypt */
	ldrb		w0, [x2], #1	/* get 1 byte from input */
	umov		w6, v0.b[0]	/* get next keystream byte */
	eor		w6, w6, w0	/* w6 = CTR ^ input */
	strb		w6, [x1], #1	/* store output byte */

	/* shift right out one byte */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* the last ciphertext is placed in high bytes */
	ins		v0.b[15], w0

	subs		w4, w4, #1
	bne		.Lgcm_dec_tail_loop

	/* padding last block with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_dec_hash_len:
	cbz		x7, .Lgcm_dec_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_dec_ret

.Lgcm_dec_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)

	.section	".rodata", "a"
	.align 4
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

.Lghash_rconst:
	.quad		0x87
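/*
 * 0x87 encodes x^7 + x^2 + x + 1, i.e. x^128 reduced modulo the GHASH
 * field polynomial x^128 + x^7 + x^2 + x + 1; ld1r replicates it into
 * both 64-bit lanes of RRCONST so it can feed both pmull and pmull2 in
 * REDUCTION.
 */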
743