/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-CCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in rfc8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

.irp b, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
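/*
 * The .irp/.set block above gives each usable vN register a symbolic
 * encoding index, and the sm4e macro hand-assembles the SM4E instruction
 * (base opcode 0xcec08400) so the file builds even with assemblers that
 * lack the SM4 extension. As a sketch, "sm4e v0.4s, v24.4s" expands to
 * .inst 0xcec08400 | (24 << 5) | 0, i.e. Vn = v24, Vd = v0.
 */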

/* Register macros */

#define RMAC	v16

/* Helper macros. */

#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;
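
/*
 * inc_le128 writes the current counter (held as native integers in
 * x7:x8, x7 being the high 64 bits) into \vctr, byte-swaps it into the
 * big-endian block layout CCM expects, and post-increments the 128-bit
 * value with carry from x8 into x7. Callers invoke it once per
 * keystream block they are about to encrypt.
 */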


.align 3
SYM_FUNC_START(sm4_ce_cbcmac_update)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: mac
	 *   x2: src
	 *   w3: nblocks
	 */
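	/*
	 * A sketch of the expected C-side declaration (the glue code's
	 * prototype is authoritative; parameter names follow the
	 * register comments above):
	 *
	 *   asmlinkage void sm4_ce_cbcmac_update(const u32 *rkey_enc,
	 *                                        u8 *mac, const u8 *src,
	 *                                        unsigned int nblocks);
	 */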
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

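	/*
	 * Note the deferred chaining order: each step computes E(state)
	 * first and XORs the next block in afterwards, so the state
	 * written back is E(X) ^ B rather than E(X ^ B). The missing
	 * final encryption is supplied by the next call or by
	 * sm4_ce_ccm_final().
	 */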
.Lcbcmac_loop_4x:
	cmp		w3, #4
	blt		.Lcbcmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b

	cbz		w3, .Lcbcmac_end
	b		.Lcbcmac_loop_4x

.Lcbcmac_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v0.16b

	cbnz		w3, .Lcbcmac_loop_1x

.Lcbcmac_end:
	st1		{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_cbcmac_update)

.align 3
SYM_FUNC_START(sm4_ce_ccm_final)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: ctr0 (big endian, 128 bit)
	 *   x2: mac
	 */
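	/*
	 * Produces the CCM tag T = E(mac) ^ E(ctr0): encrypting the
	 * pending MAC state completes the deferred CBC-MAC step, and
	 * E(ctr0) is the S0 masking block. A sketch of the expected
	 * C-side declaration (names follow the comments above):
	 *
	 *   asmlinkage void sm4_ce_ccm_final(const u32 *rkey_enc,
	 *                                    u8 *ctr0, u8 *mac);
	 */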
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x2]
	ld1		{v0.16b}, [x1]

	SM4_CRYPT_BLK2(RMAC, v0)

	/* en-/decrypt the mac with ctr0 */
	eor		RMAC.16b, RMAC.16b, v0.16b
	st1		{RMAC.16b}, [x2]

	ret
SYM_FUNC_END(sm4_ce_ccm_final)

.align 3
SYM_FUNC_START(sm4_ce_ccm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: mac
	 */
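	/*
	 * CTR encryption fused with the CBC-MAC over the plaintext.
	 * A sketch of the expected C-side declaration:
	 *
	 *   asmlinkage void sm4_ce_ccm_enc(const u32 *rkey_enc, u8 *dst,
	 *                                  const u8 *src, u8 *ctr,
	 *                                  unsigned int nbytes, u8 *mac);
	 */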
	SM4_PREPARE(x0)

	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

	ld1		{RMAC.16b}, [x5]

.Lccm_enc_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lccm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc_le128(v8)			/* +0 */
	inc_le128(v9)			/* +1 */
	inc_le128(v10)			/* +2 */
	inc_le128(v11)			/* +3 */

	ld1		{v0.16b-v3.16b}, [x2], #64

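	/*
	 * Interleave the CTR stream with the CBC-MAC chain: each
	 * SM4_CRYPT_BLK2 encrypts one independent counter block
	 * alongside the serially dependent MAC state, which helps hide
	 * the sm4e latency. Ciphertext is keystream ^ plaintext; the
	 * MAC absorbs the plaintext (v0..v3).
	 */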
	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK2(v9, RMAC)
	eor		v9.16b, v9.16b, v1.16b
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK2(v10, RMAC)
	eor		v10.16b, v10.16b, v2.16b
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK2(v11, RMAC)
	eor		v11.16b, v11.16b, v3.16b
	eor		RMAC.16b, RMAC.16b, v3.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	cbz		w4, .Lccm_enc_end
	b		.Lccm_enc_loop_4x

.Lccm_enc_loop_1x:
	cmp		w4, #16
	blt		.Lccm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc_le128(v8)

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v0.16b

	st1		{v8.16b}, [x1], #16

	cbz		w4, .Lccm_enc_end
	b		.Lccm_enc_loop_1x

.Lccm_enc_tail:
	/* construct CTRs */
	inc_le128(v8)

	SM4_CRYPT_BLK2(RMAC, v8)

	/* store new MAC */
	st1		{RMAC.16b}, [x5]

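	/*
	 * Partial final block: consume the pre-encrypted keystream (v8)
	 * and MAC state (RMAC) one byte at a time, rotating both
	 * vectors so the next byte is always in lane 0.
	 */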
.Lccm_enc_tail_loop:
	ldrb		w0, [x2], #1		/* get 1 byte from input */
	umov		w9, v8.b[0]		/* get top crypted CTR byte */
	umov		w6, RMAC.b[0]		/* get top MAC byte */

	eor		w9, w9, w0		/* w9 = CTR ^ input */
	eor		w6, w6, w0		/* w6 = MAC ^ input */

	strb		w9, [x1], #1		/* store out byte */
	strb		w6, [x5], #1		/* store MAC byte */

	subs		w4, w4, #1
	beq		.Lccm_enc_ret

	/* shift out one byte */
	ext		RMAC.16b, RMAC.16b, RMAC.16b, #1
	ext		v8.16b, v8.16b, v8.16b, #1

	b		.Lccm_enc_tail_loop

.Lccm_enc_end:
	/* store new MAC */
	st1		{RMAC.16b}, [x5]

	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lccm_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_ccm_enc)

.align 3
SYM_FUNC_START(sm4_ce_ccm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: mac
	 */
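	/*
	 * CTR decryption fused with the CBC-MAC over the recovered
	 * plaintext. A sketch of the expected C-side declaration:
	 *
	 *   asmlinkage void sm4_ce_ccm_dec(const u32 *rkey_enc, u8 *dst,
	 *                                  const u8 *src, u8 *ctr,
	 *                                  unsigned int nbytes, u8 *mac);
	 */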
	SM4_PREPARE(x0)

	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

	ld1		{RMAC.16b}, [x5]

.Lccm_dec_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lccm_dec_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc_le128(v8)			/* +0 */
	inc_le128(v9)			/* +1 */
	inc_le128(v10)			/* +2 */
	inc_le128(v11)			/* +3 */

	ld1		{v0.16b-v3.16b}, [x2], #64

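	/*
	 * Same interleaving as the encrypt path, but the MAC absorbs
	 * v8..v11, which hold the recovered plaintext after the
	 * keystream XOR: CCM always authenticates the plaintext.
	 */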
	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v8.16b
	SM4_CRYPT_BLK2(v9, RMAC)
	eor		v9.16b, v9.16b, v1.16b
	eor		RMAC.16b, RMAC.16b, v9.16b
	SM4_CRYPT_BLK2(v10, RMAC)
	eor		v10.16b, v10.16b, v2.16b
	eor		RMAC.16b, RMAC.16b, v10.16b
	SM4_CRYPT_BLK2(v11, RMAC)
	eor		v11.16b, v11.16b, v3.16b
	eor		RMAC.16b, RMAC.16b, v11.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	cbz		w4, .Lccm_dec_end
	b		.Lccm_dec_loop_4x

.Lccm_dec_loop_1x:
	cmp		w4, #16
	blt		.Lccm_dec_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc_le128(v8)

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v8.16b

	st1		{v8.16b}, [x1], #16

	cbz		w4, .Lccm_dec_end
	b		.Lccm_dec_loop_1x

.Lccm_dec_tail:
	/* construct CTRs */
	inc_le128(v8)

	SM4_CRYPT_BLK2(RMAC, v8)

	/* store new MAC */
	st1		{RMAC.16b}, [x5]

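	/*
	 * Partial final block, decrypt flavour: the MAC byte absorbs
	 * w9, the just-recovered plaintext byte, rather than the
	 * ciphertext input.
	 */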
.Lccm_dec_tail_loop:
	ldrb		w0, [x2], #1		/* get 1 byte from input */
	umov		w9, v8.b[0]		/* get top crypted CTR byte */
	umov		w6, RMAC.b[0]		/* get top MAC byte */

	eor		w9, w9, w0		/* w9 = CTR ^ input */
	eor		w6, w6, w9		/* w6 = MAC ^ output */

	strb		w9, [x1], #1		/* store out byte */
	strb		w6, [x5], #1		/* store MAC byte */

	subs		w4, w4, #1
	beq		.Lccm_dec_ret

	/* shift out one byte */
	ext		RMAC.16b, RMAC.16b, RMAC.16b, #1
	ext		v8.16b, v8.16b, v8.16b, #1

	b		.Lccm_dec_tail_loop

.Lccm_dec_end:
	/* store new MAC */
	st1		{RMAC.16b}, [x5]

	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lccm_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_ccm_dec)