/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in RFC 8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
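
/*
 * sm4e is emitted via .inst so the file assembles even when the
 * toolchain does not know the SM4 instructions (.arch above only
 * enables the base crypto extensions); the .Lv<n>.4s symbols
 * defined by the .irp block supply the register numbers for the
 * encoding.
 */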

/* Register macros */

/* Used for both encryption and decryption */
#define	RHASH	v21
#define	RRCONST	v22
#define RZERO	v23

/* Helper macros. */

/*
 * input: m0, m1
 * output: r0:r1 (low 128 bits in r0, high 128 bits in r1)
 */
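/*
 * Schoolbook carry-less multiply on 64-bit limbs: with m = mh:ml
 * and n = nh:nl,
 *
 *   m * n = (mh*nh) << 128 ^ (mh*nl ^ ml*nh) << 64 ^ (ml*nl)
 *
 * pmull/pmull2 compute the four partial products; the two
 * ext-with-RZERO steps split the middle term across the two
 * 128-bit halves of the result.
 */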
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		eor		T0.16b, T0.16b, T1.16b;		\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r1.16b, r1.16b, T0.16b;

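/*
 * Four independent 128x128 multiplies with the instructions of
 * each step grouped together, so the CPU can overlap the four
 * products instead of serializing on each one.
 */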
#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,			\
			r2, r3, m2, m3, T2, T3,			\
			r4, r5, m4, m5, T4, T5,			\
			r6, r7, m6, m7, T6, T7)			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		ext		T2.16b, m3.16b, m3.16b, #8;	\
		ext		T4.16b, m5.16b, m5.16b, #8;	\
		ext		T6.16b, m7.16b, m7.16b, #8;	\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		r2.1q, m2.1d, m3.1d;		\
		pmull		r4.1q, m4.1d, m5.1d;		\
		pmull		r6.1q, m6.1d, m7.1d;		\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull		T3.1q, m2.1d, T2.1d;		\
		pmull		T5.1q, m4.1d, T4.1d;		\
		pmull		T7.1q, m6.1d, T6.1d;		\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		T2.1q, m2.2d, T2.2d;		\
		pmull2		T4.1q, m4.2d, T4.2d;		\
		pmull2		T6.1q, m6.2d, T6.2d;		\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		pmull2		r3.1q, m2.2d, m3.2d;		\
		pmull2		r5.1q, m4.2d, m5.2d;		\
		pmull2		r7.1q, m6.2d, m7.2d;		\
		eor		T0.16b, T0.16b, T1.16b;		\
		eor		T2.16b, T2.16b, T3.16b;		\
		eor		T4.16b, T4.16b, T5.16b;		\
		eor		T6.16b, T6.16b, T7.16b;		\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T3.16b, RZERO.16b, T2.16b, #8;	\
		ext		T5.16b, RZERO.16b, T4.16b, #8;	\
		ext		T7.16b, RZERO.16b, T6.16b, #8;	\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		ext		T2.16b, T2.16b, RZERO.16b, #8;	\
		ext		T4.16b, T4.16b, RZERO.16b, #8;	\
		ext		T6.16b, T6.16b, RZERO.16b, #8;	\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r2.16b, r2.16b, T3.16b;		\
		eor		r4.16b, r4.16b, T5.16b;		\
		eor		r6.16b, r6.16b, T7.16b;		\
		eor		r1.16b, r1.16b, T0.16b;		\
		eor		r3.16b, r3.16b, T2.16b;		\
		eor		r5.16b, r5.16b, T4.16b;		\
		eor		r7.16b, r7.16b, T6.16b;

/*
 * input: r0:r1 (low 128 bits in r0, high 128 bits in r1)
 * output: a
 */
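/*
 * Reduce the 256-bit product modulo the GHASH polynomial
 * x^128 + x^7 + x^2 + x + 1 by folding the high half down in
 * two steps: first the top 64 bits (pmull2 with rconst), then
 * the remaining 64 bits (pmull with rconst).  All operands are
 * kept bit-reflected (rbit), which is what lets the reduction
 * use 0x87, the plain representation of the polynomial's low
 * coefficients (see .Lghash_rconst).
 */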
#define REDUCTION(a, r0, r1, rconst, T0, T1)			\
		pmull2		T0.1q, r1.2d, rconst.2d;	\
		ext		T1.16b, T0.16b, RZERO.16b, #8;	\
		ext		T0.16b, RZERO.16b, T0.16b, #8;	\
		eor		r1.16b, r1.16b, T1.16b;		\
		eor		r0.16b, r0.16b, T0.16b;		\
		pmull		T0.1q, r1.1d, rconst.1d;	\
		eor		a.16b, r0.16b, T0.16b;

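/*
 * One SM4 block encryption interleaved with one 128x128
 * carry-less multiply: the eight sm4e rounds and the pmull
 * sequence use disjoint registers, so suitable cores can
 * execute them in parallel.  The trailing rev64/ext/rev32
 * reverses the order of the four state words (SM4's final
 * reverse transform) and restores byte order, matching
 * SM4_CRYPT_BLK from sm4-ce-asm.h.
 */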
#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)	\
	rev32			b0.16b, b0.16b;			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
	sm4e			b0.4s, v24.4s;			\
		pmull		r0.1q, m0.1d, m1.1d;		\
	sm4e			b0.4s, v25.4s;			\
		pmull		T1.1q, m0.1d, T0.1d;		\
	sm4e			b0.4s, v26.4s;			\
		pmull2		T0.1q, m0.2d, T0.2d;		\
	sm4e			b0.4s, v27.4s;			\
		pmull2		r1.1q, m0.2d, m1.2d;		\
	sm4e			b0.4s, v28.4s;			\
		eor		T0.16b, T0.16b, T1.16b;		\
	sm4e			b0.4s, v29.4s;			\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
	sm4e			b0.4s, v30.4s;			\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
	sm4e			b0.4s, v31.4s;			\
		eor		r0.16b, r0.16b, T1.16b;		\
	rev64			b0.4s, b0.4s;			\
		eor		r1.16b, r1.16b, T0.16b;		\
	ext			b0.16b, b0.16b, b0.16b, #8;	\
	rev32			b0.16b, b0.16b;

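/*
 * Three SM4 block encryptions interleaved with three 128x128
 * multiplies; the three partial products are also folded into
 * r0:r1 at the end, ready for a single REDUCTION.
 */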
#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,			\
				    r0, r1, m0, m1, T0, T1,	\
				    r2, r3, m2, m3, T2, T3,	\
				    r4, r5, m4, m5, T4, T5)	\
	rev32			b0.16b, b0.16b;			\
	rev32			b1.16b, b1.16b;			\
	rev32			b2.16b, b2.16b;			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		ext		T2.16b, m3.16b, m3.16b, #8;	\
		ext		T4.16b, m5.16b, m5.16b, #8;	\
	sm4e			b0.4s, v24.4s;			\
	sm4e			b1.4s, v24.4s;			\
	sm4e			b2.4s, v24.4s;			\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		r2.1q, m2.1d, m3.1d;		\
		pmull		r4.1q, m4.1d, m5.1d;		\
	sm4e			b0.4s, v25.4s;			\
	sm4e			b1.4s, v25.4s;			\
	sm4e			b2.4s, v25.4s;			\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull		T3.1q, m2.1d, T2.1d;		\
		pmull		T5.1q, m4.1d, T4.1d;		\
	sm4e			b0.4s, v26.4s;			\
	sm4e			b1.4s, v26.4s;			\
	sm4e			b2.4s, v26.4s;			\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		T2.1q, m2.2d, T2.2d;		\
		pmull2		T4.1q, m4.2d, T4.2d;		\
	sm4e			b0.4s, v27.4s;			\
	sm4e			b1.4s, v27.4s;			\
	sm4e			b2.4s, v27.4s;			\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		pmull2		r3.1q, m2.2d, m3.2d;		\
		pmull2		r5.1q, m4.2d, m5.2d;		\
	sm4e			b0.4s, v28.4s;			\
	sm4e			b1.4s, v28.4s;			\
	sm4e			b2.4s, v28.4s;			\
		eor		T0.16b, T0.16b, T1.16b;		\
		eor		T2.16b, T2.16b, T3.16b;		\
		eor		T4.16b, T4.16b, T5.16b;		\
	sm4e			b0.4s, v29.4s;			\
	sm4e			b1.4s, v29.4s;			\
	sm4e			b2.4s, v29.4s;			\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T3.16b, RZERO.16b, T2.16b, #8;	\
		ext		T5.16b, RZERO.16b, T4.16b, #8;	\
	sm4e			b0.4s, v30.4s;			\
	sm4e			b1.4s, v30.4s;			\
	sm4e			b2.4s, v30.4s;			\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		ext		T2.16b, T2.16b, RZERO.16b, #8;	\
		ext		T4.16b, T4.16b, RZERO.16b, #8;	\
	sm4e			b0.4s, v31.4s;			\
	sm4e			b1.4s, v31.4s;			\
	sm4e			b2.4s, v31.4s;			\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r2.16b, r2.16b, T3.16b;		\
		eor		r4.16b, r4.16b, T5.16b;		\
	rev64			b0.4s, b0.4s;			\
	rev64			b1.4s, b1.4s;			\
	rev64			b2.4s, b2.4s;			\
		eor		r1.16b, r1.16b, T0.16b;		\
		eor		r3.16b, r3.16b, T2.16b;		\
		eor		r5.16b, r5.16b, T4.16b;		\
	ext			b0.16b, b0.16b, b0.16b, #8;	\
	ext			b1.16b, b1.16b, b1.16b, #8;	\
	ext			b2.16b, b2.16b, b2.16b, #8;	\
		eor		r0.16b, r0.16b, r2.16b;		\
		eor		r1.16b, r1.16b, r3.16b;		\
	rev32			b0.16b, b0.16b;			\
	rev32			b1.16b, b1.16b;			\
	rev32			b2.16b, b2.16b;			\
		eor		r0.16b, r0.16b, r4.16b;		\
		eor		r1.16b, r1.16b, r5.16b;

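/*
 * x8:x9 hold the counter in CPU order (loaded with ldp + rev).
 * Build the current big-endian counter block in vctr, then
 * post-increment the low 32 bits of x9, i.e. GCM's inc32() on
 * the rightmost word of the counter.
 */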
#define inc32_le128(vctr)					\
		mov		vctr.d[1], x9;			\
		add		w6, w9, #1;			\
		mov		vctr.d[0], x8;			\
		bfi		x9, x6, #0, #32;		\
		rev64		vctr.16b, vctr.16b;

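/*
 * Final step of GCM: fold the lengths block len(A) || len(C)
 * into the hash, then compute the tag T = E(K, CTR0) ^ GHASH,
 * where CTR0 is the initial counter block (low 32 bits = 1).
 * The block encryption and the last GHASH multiply are run
 * through the interleaved SM4_CRYPT_PMUL_128x128_BLK.
 */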
#define GTAG_HASH_LENGTHS(vctr0, vlen)					\
		ld1		{vlen.16b}, [x7];			\
		/* construct CTR0 */					\
		/* the lower 32 bits of the initial IV are always be32(1) */ \
		mov		x6, #0x1;				\
		bfi		x9, x6, #0, #32;			\
		mov		vctr0.d[0], x8;				\
		mov		vctr0.d[1], x9;				\
		rbit		vlen.16b, vlen.16b;			\
		rev64		vctr0.16b, vctr0.16b;			\
		/* authtag = GCTR(CTR0, GHASH) */			\
		eor		RHASH.16b, RHASH.16b, vlen.16b;		\
		SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1,	\
					   RTMP0, RTMP1);		\
		REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3);	\
		rbit		RHASH.16b, RHASH.16b;			\
		eor		RHASH.16b, RHASH.16b, vctr0.16b;


/* Register macros for encrypt and ghash */

/* can be the same as input v0-v3 */
#define	RR1	v0
#define	RR3	v1
#define	RR5	v2
#define	RR7	v3

#define	RR0	v4
#define	RR2	v5
#define	RR4	v6
#define	RR6	v7

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11
#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define	RH1	v16
#define	RH2	v17
#define	RH3	v18
#define	RH4	v19

.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: ghash table
	 */
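	/*
	 * The hash key is H = E(K, 0^128).  It is stored
	 * bit-reflected, together with H^2, H^3 and H^4, so the
	 * bulk loops can process up to four blocks per reduction
	 * (aggregated GHASH).
	 */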
	SM4_PREPARE(x0)

	adr_l		x2, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x2]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	/* H = E(K, 0^128) */
	rev32		v0.16b, RZERO.16b
	SM4_CRYPT_BLK_BE(v0)

	/* H ^ 1 */
	rbit		RH1.16b, v0.16b

	/* H ^ 2 */
	PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
	REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 3 */
	PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
	REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 4 */
	PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
	REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{RH1.16b-RH4.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)

.align 3
SYM_FUNC_START(pmull_ghash_update)
	/* input:
	 *   x0: ghash table
	 *   x1: ghash result
	 *   x2: src
	 *   w3: nblocks
	 */
	ld1		{RH1.16b-RH4.16b}, [x0]

	ld1		{RHASH.16b}, [x1]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x4, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x4]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

.Lghash_loop_4x:
	cmp		w3, #4
	blt		.Lghash_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
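	/*
	 * Aggregated update: instead of four dependent
	 * multiply-reduce steps,
	 *
	 *   ((((X ^ in0)H ^ in1)H ^ in2)H ^ in3)H
	 *     = (X ^ in0)*H^4 ^ in1*H^3 ^ in2*H^2 ^ in3*H
	 *
	 * so the four products are independent and only one
	 * reduction is needed.
	 */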
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w3, .Lghash_end
	b		.Lghash_loop_4x

.Lghash_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbnz		w3, .Lghash_loop_1x

.Lghash_end:
	rbit		RHASH.16b, RHASH.16b
	st1		{RHASH.2d}, [x1]

	ret
SYM_FUNC_END(pmull_ghash_update)

.align 3
SYM_FUNC_START(sm4_ce_pmull_gcm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
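	/*
	 * The CTR value is read from and written back to [x3], so
	 * the function can be called repeatedly on a stream.  A
	 * non-NULL x7 marks the final call: the lengths block is
	 * hashed and the authentication tag is stored through x5
	 * in place of the running GHASH state.
	 */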
	SM4_PREPARE(x0)

	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH4.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_enc_hash_len

.Lgcm_enc_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lgcm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	inc32_le128(v1)			/* +1 */
	inc32_le128(v2)			/* +2 */
	inc32_le128(v3)			/* +3 */

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	/* ghash update */

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc32_le128(v0)

	ld1		{RTMP0.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, RTMP0.16b
	st1		{v0.16b}, [x1], #16

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_1x

.Lgcm_enc_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

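	/*
	 * Partial final block: the keystream block in v0 is
	 * consumed one byte at a time, and each ciphertext byte
	 * is also inserted at the high end of v0.  The permute
	 * table entry at offset (32 - nbytes) then moves those
	 * bytes back to the low end and zeroes the rest, giving
	 * the zero-padded block for the ghash update below.
	 */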
.Lgcm_enc_tail_loop:
	/* do encrypt */
	ldrb		w0, [x2], #1	/* get 1 byte from input */
	umov		w6, v0.b[0]	/* get lowest keystream byte */
	eor		w6, w6, w0	/* w6 = keystream ^ input */
	strb		w6, [x1], #1	/* store out byte */

	/* shift right out one byte */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* the ciphertext bytes accumulate at the high end */
	ins		v0.b[15], w6

	subs		w4, w4, #1
	bne		.Lgcm_enc_tail_loop

	/* pad the final block with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
	cbz		x7, .Lgcm_enc_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_enc_ret

.Lgcm_enc_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)

#undef	RR1
#undef	RR3
#undef	RR5
#undef	RR7
#undef	RR0
#undef	RR2
#undef	RR4
#undef	RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef	RH1
#undef	RH2
#undef	RH3
#undef	RH4


/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */

#define	RR1	v6
#define	RR3	v7
#define	RR5	v8

#define	RR0	v9
#define	RR2	v10
#define	RR4	v11

#define RTMP0	v12
#define RTMP1	v13
#define RTMP2	v14
#define RTMP3	v15
#define RTMP4	v16
#define RTMP5	v17

#define	RH1	v18
#define	RH2	v19
#define	RH3	v20

.align 3
SYM_FUNC_START(sm4_ce_pmull_gcm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
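	/*
	 * The decrypt path hashes the ciphertext before it is
	 * decrypted, so the SM4 rounds and the GHASH multiplies
	 * for the same blocks can be interleaved in one pass.
	 * Three blocks are processed per iteration here, hence
	 * only H^1..H^3 are loaded from the ghash table.
	 */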
	SM4_PREPARE(x0)

	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH3.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_dec_hash_len

.Lgcm_dec_loop_3x:
	cmp		w4, #(3 * 16)
	blt		.Lgcm_dec_loop_1x

	sub		w4, w4, #(3 * 16)

	ld1		{v3.16b-v5.16b}, [x2], #(3 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	rbit		v6.16b, v3.16b
	inc32_le128(v1)			/* +1 */
	rbit		v7.16b, v4.16b
	inc32_le128(v2)			/* +2 */
	rbit		v8.16b, v5.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	/* decrypt & ghash update */
	SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
				    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
				    RR2, RR3, v7, RH2, RTMP2, RTMP3,
				    RR4, RR5, v8, RH1, RTMP4, RTMP5)

	eor		v0.16b, v0.16b, v3.16b
	eor		v1.16b, v1.16b, v4.16b
	eor		v2.16b, v2.16b, v5.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	st1		{v0.16b-v2.16b}, [x1], #(3 * 16)

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_3x

.Lgcm_dec_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_dec_tail

	sub		w4, w4, #16

	ld1		{v3.16b}, [x2], #16

	/* construct CTRs */
	inc32_le128(v0)
	rbit		v6.16b, v3.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

	eor		v0.16b, v0.16b, v3.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{v0.16b}, [x1], #16

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_1x

.Lgcm_dec_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

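	/*
	 * Partial final block, as in the encrypt path, except
	 * that the byte fed back into v0 for the ghash update is
	 * the input ciphertext byte (w0), not the output.
	 */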
.Lgcm_dec_tail_loop:
	/* do decrypt */
	ldrb		w0, [x2], #1	/* get 1 byte from input */
	umov		w6, v0.b[0]	/* get lowest keystream byte */
	eor		w6, w6, w0	/* w6 = keystream ^ input */
	strb		w6, [x1], #1	/* store out byte */

	/* shift right out one byte */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* the input ciphertext bytes accumulate at the high end */
	ins		v0.b[15], w0

	subs		w4, w4, #1
	bne		.Lgcm_dec_tail_loop

	/* pad the final block with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_dec_hash_len:
	cbz		x7, .Lgcm_dec_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_dec_ret

.Lgcm_dec_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)

	.section	".rodata", "a"
	.align 4
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

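/*
 * Reduction constant: 0x87 = x^7 + x^2 + x + 1, the low part of
 * the GHASH polynomial x^128 + x^7 + x^2 + x + 1; ld1r loads it
 * into both 64-bit lanes for the two folding steps of REDUCTION.
 */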
.Lghash_rconst:
	.quad		0x87