/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in RFC 8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
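
/*
 * The sm4e macro hand-encodes the SM4E instruction via .inst, using the
 * .Lv<n>.4s index symbols defined by the .irp block above, so the file
 * can be assembled even by toolchains that do not know the SM4 extension.
 */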

/* Register macros */

/* Used for both encryption and decryption */
#define	RHASH	v21
#define	RRCONST	v22
#define RZERO	v23

/* Helper macros. */

/*
 * input: m0, m1
 * output: r0:r1 (low 128-bits in r0, high in r1)
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		eor		T0.16b, T0.16b, T1.16b;		\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r1.16b, r1.16b, T0.16b;
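
/*
 * PMUL_128x128 forms the 256-bit carry-less product r0:r1 schoolbook
 * style: r0 = m0.lo * m1.lo, r1 = m0.hi * m1.hi, and the two cross
 * products (computed against the half-swapped copy of m1 in T0) are
 * XOR-summed and shifted by 64 bits into the middle of r0:r1 via ext
 * against RZERO.
 */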

#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,			\
			r2, r3, m2, m3, T2, T3,			\
			r4, r5, m4, m5, T4, T5,			\
			r6, r7, m6, m7, T6, T7)			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		ext		T2.16b, m3.16b, m3.16b, #8;	\
		ext		T4.16b, m5.16b, m5.16b, #8;	\
		ext		T6.16b, m7.16b, m7.16b, #8;	\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		r2.1q, m2.1d, m3.1d;		\
		pmull		r4.1q, m4.1d, m5.1d;		\
		pmull		r6.1q, m6.1d, m7.1d;		\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull		T3.1q, m2.1d, T2.1d;		\
		pmull		T5.1q, m4.1d, T4.1d;		\
		pmull		T7.1q, m6.1d, T6.1d;		\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		T2.1q, m2.2d, T2.2d;		\
		pmull2		T4.1q, m4.2d, T4.2d;		\
		pmull2		T6.1q, m6.2d, T6.2d;		\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		pmull2		r3.1q, m2.2d, m3.2d;		\
		pmull2		r5.1q, m4.2d, m5.2d;		\
		pmull2		r7.1q, m6.2d, m7.2d;		\
		eor		T0.16b, T0.16b, T1.16b;		\
		eor		T2.16b, T2.16b, T3.16b;		\
		eor		T4.16b, T4.16b, T5.16b;		\
		eor		T6.16b, T6.16b, T7.16b;		\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T3.16b, RZERO.16b, T2.16b, #8;	\
		ext		T5.16b, RZERO.16b, T4.16b, #8;	\
		ext		T7.16b, RZERO.16b, T6.16b, #8;	\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		ext		T2.16b, T2.16b, RZERO.16b, #8;	\
		ext		T4.16b, T4.16b, RZERO.16b, #8;	\
		ext		T6.16b, T6.16b, RZERO.16b, #8;	\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r2.16b, r2.16b, T3.16b;		\
		eor		r4.16b, r4.16b, T5.16b;		\
		eor		r6.16b, r6.16b, T7.16b;		\
		eor		r1.16b, r1.16b, T0.16b;		\
		eor		r3.16b, r3.16b, T2.16b;		\
		eor		r5.16b, r5.16b, T4.16b;		\
		eor		r7.16b, r7.16b, T6.16b;

/*
 * input: r0:r1 (low 128-bits in r0, high in r1)
 * output: a
 */
#define REDUCTION(a, r0, r1, rconst, T0, T1)			\
		pmull2		T0.1q, r1.2d, rconst.2d;	\
		ext		T1.16b, T0.16b, RZERO.16b, #8;	\
		ext		T0.16b, RZERO.16b, T0.16b, #8;	\
		eor		r1.16b, r1.16b, T1.16b;		\
		eor		r0.16b, r0.16b, T0.16b;		\
		pmull		T0.1q, r1.1d, rconst.1d;	\
		eor		a.16b, r0.16b, T0.16b;
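
/*
 * REDUCTION folds the 256-bit product r0:r1 back to 128 bits modulo the
 * GHASH polynomial g(x) = x^128 + x^7 + x^2 + x + 1.  Because all operands
 * have been bit-reflected with rbit, the constant 0x87 (the low terms
 * x^7 + x^2 + x + 1) can be used directly: the upper half of r1 is folded
 * first with pmull2, then the remaining lower half with pmull.
 */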

#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)	\
	rev32			b0.16b, b0.16b;			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
	sm4e			b0.4s, v24.4s;			\
		pmull		r0.1q, m0.1d, m1.1d;		\
	sm4e			b0.4s, v25.4s;			\
		pmull		T1.1q, m0.1d, T0.1d;		\
	sm4e			b0.4s, v26.4s;			\
		pmull2		T0.1q, m0.2d, T0.2d;		\
	sm4e			b0.4s, v27.4s;			\
		pmull2		r1.1q, m0.2d, m1.2d;		\
	sm4e			b0.4s, v28.4s;			\
		eor		T0.16b, T0.16b, T1.16b;		\
	sm4e			b0.4s, v29.4s;			\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
	sm4e			b0.4s, v30.4s;			\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
	sm4e			b0.4s, v31.4s;			\
		eor		r0.16b, r0.16b, T1.16b;		\
	rev64			b0.4s, b0.4s;			\
		eor		r1.16b, r1.16b, T0.16b;		\
	ext			b0.16b, b0.16b, b0.16b, #8;	\
	rev32			b0.16b, b0.16b;
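
/*
 * One SM4 block encryption (the eight sm4e rounds; v24-v31 hold the round
 * keys loaded by SM4_PREPARE) interleaved with one PMUL_128x128, so the
 * SM4 and polynomial-multiply pipelines can overlap.  The trailing
 * rev64/ext/rev32 restore the output word order exactly as SM4_CRYPT_BLK
 * does.
 */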

#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,			\
				    r0, r1, m0, m1, T0, T1,	\
				    r2, r3, m2, m3, T2, T3,	\
				    r4, r5, m4, m5, T4, T5)	\
	rev32			b0.16b, b0.16b;			\
	rev32			b1.16b, b1.16b;			\
	rev32			b2.16b, b2.16b;			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		ext		T2.16b, m3.16b, m3.16b, #8;	\
		ext		T4.16b, m5.16b, m5.16b, #8;	\
	sm4e			b0.4s, v24.4s;			\
	sm4e			b1.4s, v24.4s;			\
	sm4e			b2.4s, v24.4s;			\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		r2.1q, m2.1d, m3.1d;		\
		pmull		r4.1q, m4.1d, m5.1d;		\
	sm4e			b0.4s, v25.4s;			\
	sm4e			b1.4s, v25.4s;			\
	sm4e			b2.4s, v25.4s;			\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull		T3.1q, m2.1d, T2.1d;		\
		pmull		T5.1q, m4.1d, T4.1d;		\
	sm4e			b0.4s, v26.4s;			\
	sm4e			b1.4s, v26.4s;			\
	sm4e			b2.4s, v26.4s;			\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		T2.1q, m2.2d, T2.2d;		\
		pmull2		T4.1q, m4.2d, T4.2d;		\
	sm4e			b0.4s, v27.4s;			\
	sm4e			b1.4s, v27.4s;			\
	sm4e			b2.4s, v27.4s;			\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		pmull2		r3.1q, m2.2d, m3.2d;		\
		pmull2		r5.1q, m4.2d, m5.2d;		\
	sm4e			b0.4s, v28.4s;			\
	sm4e			b1.4s, v28.4s;			\
	sm4e			b2.4s, v28.4s;			\
		eor		T0.16b, T0.16b, T1.16b;		\
		eor		T2.16b, T2.16b, T3.16b;		\
		eor		T4.16b, T4.16b, T5.16b;		\
	sm4e			b0.4s, v29.4s;			\
	sm4e			b1.4s, v29.4s;			\
	sm4e			b2.4s, v29.4s;			\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T3.16b, RZERO.16b, T2.16b, #8;	\
		ext		T5.16b, RZERO.16b, T4.16b, #8;	\
	sm4e			b0.4s, v30.4s;			\
	sm4e			b1.4s, v30.4s;			\
	sm4e			b2.4s, v30.4s;			\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		ext		T2.16b, T2.16b, RZERO.16b, #8;	\
		ext		T4.16b, T4.16b, RZERO.16b, #8;	\
	sm4e			b0.4s, v31.4s;			\
	sm4e			b1.4s, v31.4s;			\
	sm4e			b2.4s, v31.4s;			\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r2.16b, r2.16b, T3.16b;		\
		eor		r4.16b, r4.16b, T5.16b;		\
	rev64			b0.4s, b0.4s;			\
	rev64			b1.4s, b1.4s;			\
	rev64			b2.4s, b2.4s;			\
		eor		r1.16b, r1.16b, T0.16b;		\
		eor		r3.16b, r3.16b, T2.16b;		\
		eor		r5.16b, r5.16b, T4.16b;		\
	ext			b0.16b, b0.16b, b0.16b, #8;	\
	ext			b1.16b, b1.16b, b1.16b, #8;	\
	ext			b2.16b, b2.16b, b2.16b, #8;	\
		eor		r0.16b, r0.16b, r2.16b;		\
		eor		r1.16b, r1.16b, r3.16b;		\
	rev32			b0.16b, b0.16b;			\
	rev32			b1.16b, b1.16b;			\
	rev32			b2.16b, b2.16b;			\
		eor		r0.16b, r0.16b, r4.16b;		\
		eor		r1.16b, r1.16b, r5.16b;
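
/*
 * Three SM4 block encryptions interleaved with three GHASH multiplies;
 * the three partial products are XOR-folded into r0:r1 so that a single
 * REDUCTION suffices.
 */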

#define inc32_le128(vctr)					\
		mov		vctr.d[1], x9;			\
		add		w6, w9, #1;			\
		mov		vctr.d[0], x8;			\
		bfi		x9, x6, #0, #32;		\
		rev64		vctr.16b, vctr.16b;
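
/*
 * GCM increments only the low 32 bits of the counter block (big-endian).
 * x8:x9 hold the IV in CPU byte order, so the counter is the low word of
 * x9: it is copied into vctr, bumped by one for the next call, and rev64
 * converts the lanes back to the big-endian block layout.
 */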

#define GTAG_HASH_LENGTHS(vctr0, vlen)					\
		ld1		{vlen.16b}, [x7];			\
		/* construct CTR0 */					\
		/* the lower 32 bits of the initial IV are always be32(1) */	\
		mov		x6, #0x1;				\
		bfi		x9, x6, #0, #32;			\
		mov		vctr0.d[0], x8;				\
		mov		vctr0.d[1], x9;				\
		rbit		vlen.16b, vlen.16b;			\
		rev64		vctr0.16b, vctr0.16b;			\
		/* authtag = GCTR(CTR0, GHASH) */			\
		eor		RHASH.16b, RHASH.16b, vlen.16b;		\
		SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1,	\
					   RTMP0, RTMP1);		\
		REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3);	\
		rbit		RHASH.16b, RHASH.16b;			\
		eor		RHASH.16b, RHASH.16b, vctr0.16b;
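
/*
 * GTAG_HASH_LENGTHS finishes GHASH over the caller-supplied lengths block
 * at x7 (len(A) || len(C), as the GCM spec requires) and then computes the
 * authentication tag as E(K, CTR0) ^ GHASH, where CTR0 is the IV with its
 * 32-bit counter set to 1.  The SM4 encryption of CTR0 and the final GHASH
 * multiply are fused in one SM4_CRYPT_PMUL_128x128_BLK.
 */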


/* Register macros for encrypt and ghash */

/* can be the same as input v0-v3 */
#define	RR1	v0
#define	RR3	v1
#define	RR5	v2
#define	RR7	v3

#define	RR0	v4
#define	RR2	v5
#define	RR4	v6
#define	RR6	v7

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11
#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define	RH1	v16
#define	RH2	v17
#define	RH3	v18
#define	RH4	v19

.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: ghash table
	 */
	SM4_PREPARE(x0)

	adr_l		x2, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x2]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	/* H = E(K, 0^128) */
	rev32		v0.16b, RZERO.16b
	SM4_CRYPT_BLK_BE(v0)

	/* H ^ 1 */
	rbit		RH1.16b, v0.16b

	/* H ^ 2 */
	PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
	REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 3 */
	PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
	REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 4 */
	PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
	REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)
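
	/*
	 * RH1..RH4 now hold H^1..H^4 in bit-reflected form; the GHASH update
	 * and encrypt paths use them to aggregate four blocks per reduction.
	 */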

	st1		{RH1.16b-RH4.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)

.align 3
SYM_FUNC_START(pmull_ghash_update)
	/* input:
	 *   x0: ghash table
	 *   x1: ghash result
	 *   x2: src
	 *   w3: nblocks
	 */
	ld1		{RH1.16b-RH4.16b}, [x0]

	ld1		{RHASH.16b}, [x1]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x4, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x4]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

.Lghash_loop_4x:
	cmp		w3, #4
	blt		.Lghash_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
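	/*
	 * One reduction per four blocks: with X the current GHASH state and
	 * C0..C3 the four input blocks,
	 *   ((((X ^ C0)*H ^ C1)*H ^ C2)*H ^ C3)*H
	 *     = (X ^ C0)*H^4 ^ C1*H^3 ^ C2*H^2 ^ C3*H
	 * so the four products can be XOR-summed before a single REDUCTION.
	 */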
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w3, .Lghash_end
	b		.Lghash_loop_4x

.Lghash_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbnz		w3, .Lghash_loop_1x

.Lghash_end:
	rbit		RHASH.16b, RHASH.16b
	st1		{RHASH.2d}, [x1]

	ret
SYM_FUNC_END(pmull_ghash_update)

.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH4.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_enc_hash_len

.Lgcm_enc_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lgcm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	inc32_le128(v1)			/* +1 */
	inc32_le128(v2)			/* +2 */
	inc32_le128(v3)			/* +3 */

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	/* ghash update */

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc32_le128(v0)

	ld1		{RTMP0.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, RTMP0.16b
	st1		{v0.16b}, [x1], #16

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_1x

.Lgcm_enc_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]
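
	/*
	 * v3 now holds a window of .Lcts_permute_table starting at offset
	 * 32 - nbytes: after the byte loop below the tail ciphertext sits in
	 * the top nbytes of v0, and tbl moves it down to the start of the
	 * block while the 0xff indices zero-fill the rest, producing the
	 * zero-padded block that GHASH expects.
	 */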

.Lgcm_enc_tail_loop:
	/* do encrypt */
	ldrb		w0, [x2], #1	/* load one plaintext byte */
	umov		w6, v0.b[0]	/* get next keystream byte */
	eor		w6, w6, w0	/* w6 = keystream ^ plaintext */
	strb		w6, [x1], #1	/* store ciphertext byte */

	/* rotate out the keystream byte just used */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* collect the ciphertext bytes at the high end of v0 */
	ins		v0.b[15], w6

	subs		w4, w4, #1
	bne		.Lgcm_enc_tail_loop

	/* zero-pad the final partial block */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
	cbz		x7, .Lgcm_enc_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_enc_ret

.Lgcm_enc_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)

#undef	RR1
#undef	RR3
#undef	RR5
#undef	RR7
#undef	RR0
#undef	RR2
#undef	RR4
#undef	RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef	RH1
#undef	RH2
#undef	RH3
#undef	RH4


/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */

#define	RR1	v6
#define	RR3	v7
#define	RR5	v8

#define	RR0	v9
#define	RR2	v10
#define	RR4	v11

#define RTMP0	v12
#define RTMP1	v13
#define RTMP2	v14
#define RTMP3	v15
#define RTMP4	v16
#define RTMP5	v17

#define	RH1	v18
#define	RH2	v19
#define	RH3	v20

.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH3.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_dec_hash_len

.Lgcm_dec_loop_3x:
	cmp		w4, #(3 * 16)
	blt		.Lgcm_dec_loop_1x

	sub		w4, w4, #(3 * 16)

	ld1		{v3.16b-v5.16b}, [x2], #(3 * 16)
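
	/*
	 * For decryption the GHASH input is the ciphertext just loaded, so
	 * the SM4 counter encryption and the GHASH multiplies can be fused
	 * below with SM4_CRYPT_PMUL_128x128_BLK3; the encrypt path cannot do
	 * this because its GHASH input only exists after the XOR.
	 */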

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	rbit		v6.16b, v3.16b
	inc32_le128(v1)			/* +1 */
	rbit		v7.16b, v4.16b
	inc32_le128(v2)			/* +2 */
	rbit		v8.16b, v5.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	/* decrypt & ghash update */
	SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
				    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
				    RR2, RR3, v7, RH2, RTMP2, RTMP3,
				    RR4, RR5, v8, RH1, RTMP4, RTMP5)

	eor		v0.16b, v0.16b, v3.16b
	eor		v1.16b, v1.16b, v4.16b
	eor		v2.16b, v2.16b, v5.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	st1		{v0.16b-v2.16b}, [x1], #(3 * 16)

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_3x

.Lgcm_dec_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_dec_tail

	sub		w4, w4, #16

	ld1		{v3.16b}, [x2], #16

	/* construct CTRs */
	inc32_le128(v0)
	rbit		v6.16b, v3.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

	eor		v0.16b, v0.16b, v3.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{v0.16b}, [x1], #16

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_1x

.Lgcm_dec_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

.Lgcm_dec_tail_loop:
	/* do decrypt */
	ldrb		w0, [x2], #1	/* load one ciphertext byte */
	umov		w6, v0.b[0]	/* get next keystream byte */
	eor		w6, w6, w0	/* w6 = keystream ^ ciphertext */
	strb		w6, [x1], #1	/* store plaintext byte */

	/* rotate out the keystream byte just used */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* collect the ciphertext (input) bytes at the high end of v0 */
	ins		v0.b[15], w0

	subs		w4, w4, #1
	bne		.Lgcm_dec_tail_loop

	/* zero-pad the final partial block */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_dec_hash_len:
	cbz		x7, .Lgcm_dec_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_dec_ret

.Lgcm_dec_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)

	.section	".rodata", "a"
	.align 4
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

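/*
 * GHASH reduction constant: the low coefficients x^7 + x^2 + x + 1 of
 * g(x) = x^128 + x^7 + x^2 + x + 1.  ld1r replicates this 64-bit value
 * into both lanes of RRCONST.
 */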
.Lghash_rconst:
	.quad		0x87