1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * SM4-CCM AEAD Algorithm using ARMv8 Crypto Extensions
4 * as specified in rfc8998
5 * https://datatracker.ietf.org/doc/html/rfc8998
6 *
7 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
8 */
9
10#include <linux/linkage.h>
11#include <linux/cfi_types.h>
12#include <asm/assembler.h>
13#include "sm4-ce-asm.h"
14
.arch	armv8-a+crypto

/*
 * Map the vector register names used in this file to their raw encoding
 * numbers (.Lv<N>.4s = N), so that the hand-assembled SM4E instruction
 * below can look up the register field values.  Only the registers this
 * file (and the included sm4-ce-asm.h helpers) actually touch are listed.
 */
.irp b, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

/*
 * sm4e vd, vn: SM4 round encryption (vd holds the state, vn the round keys).
 * Emitted via .inst with the fixed opcode 0xcec08400 plus the Rn (bits 9:5)
 * and Rd (bits 4:0) register fields, so the file assembles even with
 * toolchains that do not know the ARMv8.2 SM4 extension mnemonics.
 * Used indirectly by the SM4_CRYPT_BLK* macros from sm4-ce-asm.h.
 */
.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
24
/* Register macros */

#define RMAC	v16	/* running CBC-MAC / CCM authentication state */

/* Helper macros. */

/*
 * inc_le128(vctr): materialize the current 128-bit CTR value into vctr in
 * big-endian byte order, then post-increment the counter held in x7:x8.
 *
 * x7 = high 64 bits, x8 = low 64 bits, both in native (little) endianness
 * (the callers load the big-endian IV with ldp + rev).  The two mov's place
 * high/low into vctr.d[0]/d[1] and the rev64 byte-swaps each 64-bit lane to
 * restore big-endian layout.  adds/adc increment x8 with carry into x7;
 * note the rev64 sits between them — it does not touch the NZCV flags, so
 * the carry produced by adds is still live for the adc.
 */
#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;
37
38
.align 3
/*
 * void sm4_ce_cbcmac_update(const u32 *rkey, u8 *mac, const u8 *src,
 *			     unsigned int nblocks)
 *
 * Absorb nblocks full 16-byte blocks of associated data / plaintext into
 * the CBC-MAC state at [x1].  Note the update is computed in the deferred
 * form mac = E(mac) ^ block rather than the textbook E(mac ^ block): the
 * final encryption of the last XOR is performed later, by
 * sm4_ce_ccm_final() (or the next call into this function), which yields
 * the same end result.
 *
 * NOTE(review): assumes nblocks > 0 — with w3 == 0 the 1x loop would
 * underflow; presumably the C glue never calls it that way (confirm).
 */
SYM_FUNC_START(sm4_ce_cbcmac_update)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: mac
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

.Lcbcmac_loop_4x:
	cmp		w3, #4
	blt		.Lcbcmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* four chained rounds of mac = E(mac) ^ src[i]; the chain is
	 * inherently serial, so the 4x unroll only saves loop overhead */
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b

	cbz		w3, .Lcbcmac_end
	b		.Lcbcmac_loop_4x

.Lcbcmac_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v0.16b

	cbnz		w3, .Lcbcmac_loop_1x

.Lcbcmac_end:
	/* write the (still not finally encrypted) MAC state back */
	st1		{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_cbcmac_update)
85
.align 3
/*
 * void sm4_ce_ccm_final(const u32 *rkey, const u8 *ctr0, u8 *mac)
 *
 * Produce the final CCM tag: both the deferred MAC state and the ctr0
 * block are encrypted (SM4_CRYPT_BLK2 runs the two blocks through the
 * cipher together), then tag = E(mac) ^ E(ctr0) is stored back to [x2].
 * E(mac) completes the deferred CBC-MAC chain (see sm4_ce_cbcmac_update),
 * and E(ctr0) is the standard CCM tag-masking keystream block.
 */
SYM_FUNC_START(sm4_ce_ccm_final)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: ctr0 (big endian, 128 bit)
	 *   x2: mac
	 */
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x2]
	ld1		{v0.16b}, [x1]

	SM4_CRYPT_BLK2(RMAC, v0)

	/* en-/decrypt the mac with ctr0 */
	eor		RMAC.16b, RMAC.16b, v0.16b
	st1		{RMAC.16b}, [x2]

	ret
SYM_FUNC_END(sm4_ce_ccm_final)
106
.align 3
/*
 * void sm4_ce_ccm_enc(const u32 *rkey, u8 *dst, const u8 *src, u8 *ctr,
 *		       unsigned int nbytes, u8 *mac)
 *
 * CCM encryption: CTR-mode encrypt src into dst while folding the
 * *plaintext* into the deferred CBC-MAC state (mac = E(mac) ^ plaintext,
 * see sm4_ce_cbcmac_update).  Each SM4_CRYPT_BLK2 encrypts one counter
 * block together with one MAC-chain step, so keystream generation and
 * authentication share the cipher invocations.
 *
 * The counter is kept in x7 (high) / x8 (low) in native endianness and is
 * written back big-endian on the full-block exit path; the tail path
 * consumes one extra counter block and does NOT write the counter back
 * (a partial block can only be the final call).
 *
 * NOTE(review): the tail loop assumes nbytes % 16 != 0 when it is entered
 * with w4 > 0, and that nbytes > 0 overall — w4 == 0 reaching
 * .Lccm_enc_tail would underflow the byte loop; presumably the C glue
 * guarantees this (confirm against caller).
 */
SYM_TYPED_FUNC_START(sm4_ce_ccm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: mac
	 */
	SM4_PREPARE(x0)

	/* load big-endian counter and convert to native in x7:x8 */
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

	ld1		{RMAC.16b}, [x5]

.Lccm_enc_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lccm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc_le128(v8)			/* +0 */
	inc_le128(v9)			/* +1 */
	inc_le128(v10)			/* +2 */
	inc_le128(v11)			/* +3 */

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* per block: keystream = E(ctr), ct = keystream ^ pt,
	 * mac = E(mac) ^ pt — the MAC chain serializes the four steps */
	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK2(v9, RMAC)
	eor		v9.16b, v9.16b, v1.16b
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK2(v10, RMAC)
	eor		v10.16b, v10.16b, v2.16b
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK2(v11, RMAC)
	eor		v11.16b, v11.16b, v3.16b
	eor		RMAC.16b, RMAC.16b, v3.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	cbz		w4, .Lccm_enc_end
	b		.Lccm_enc_loop_4x

.Lccm_enc_loop_1x:
	cmp		w4, #16
	blt		.Lccm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc_le128(v8)

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v0.16b

	st1		{v8.16b}, [x1], #16

	cbz		w4, .Lccm_enc_end
	b		.Lccm_enc_loop_1x

.Lccm_enc_tail:
	/* partial final block: v8 = E(ctr) keystream, RMAC = E(mac).
	 * The full E(mac) is stored first; the byte loop then overwrites
	 * the first w4 MAC bytes with E(mac) ^ pt, which equals the MAC
	 * update for the zero-padded final block. */
	/* construct CTRs */
	inc_le128(v8)

	SM4_CRYPT_BLK2(RMAC, v8)

	/* store new MAC */
	st1		{RMAC.16b}, [x5]

.Lccm_enc_tail_loop:
	ldrb		w0, [x2], #1		/* get 1 byte from input */
	umov		w9, v8.b[0]		/* get top crypted CTR byte */
	umov		w6, RMAC.b[0]		/* get top MAC byte */

	eor		w9, w9, w0		/* w9 = CTR ^ input */
	eor		w6, w6, w0		/* w6 = MAC ^ input */

	strb		w9, [x1], #1		/* store out byte */
	strb		w6, [x5], #1		/* store MAC byte */

	subs		w4, w4, #1
	beq		.Lccm_enc_ret

	/* shift out one byte */
	ext		RMAC.16b, RMAC.16b, RMAC.16b, #1
	ext		v8.16b, v8.16b, v8.16b, #1

	b		.Lccm_enc_tail_loop

.Lccm_enc_end:
	/* store new MAC */
	st1		{RMAC.16b}, [x5]

	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lccm_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_ccm_enc)
218
.align 3
/*
 * void sm4_ce_ccm_dec(const u32 *rkey, u8 *dst, const u8 *src, u8 *ctr,
 *		       unsigned int nbytes, u8 *mac)
 *
 * CCM decryption: CTR-mode decrypt src into dst while folding the
 * *recovered plaintext* (i.e. the decrypted output) into the deferred
 * CBC-MAC state — the only difference from sm4_ce_ccm_enc, which absorbs
 * its input.  Structure otherwise mirrors sm4_ce_ccm_enc exactly: counter
 * in x7 (high) / x8 (low), counter written back only on the full-block
 * exit path, partial final block handled byte-wise in the tail.
 *
 * NOTE(review): like the enc path, the tail loop assumes nbytes > 0 when
 * .Lccm_dec_tail is reached — presumably guaranteed by the C glue
 * (confirm against caller).
 */
SYM_TYPED_FUNC_START(sm4_ce_ccm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: mac
	 */
	SM4_PREPARE(x0)

	/* load big-endian counter and convert to native in x7:x8 */
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

	ld1		{RMAC.16b}, [x5]

.Lccm_dec_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lccm_dec_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc_le128(v8)			/* +0 */
	inc_le128(v9)			/* +1 */
	inc_le128(v10)			/* +2 */
	inc_le128(v11)			/* +3 */

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* per block: pt = E(ctr) ^ ct (left in v8..v11),
	 * mac = E(mac) ^ pt — note the MAC absorbs the output here */
	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v8.16b
	SM4_CRYPT_BLK2(v9, RMAC)
	eor		v9.16b, v9.16b, v1.16b
	eor		RMAC.16b, RMAC.16b, v9.16b
	SM4_CRYPT_BLK2(v10, RMAC)
	eor		v10.16b, v10.16b, v2.16b
	eor		RMAC.16b, RMAC.16b, v10.16b
	SM4_CRYPT_BLK2(v11, RMAC)
	eor		v11.16b, v11.16b, v3.16b
	eor		RMAC.16b, RMAC.16b, v11.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	cbz		w4, .Lccm_dec_end
	b		.Lccm_dec_loop_4x

.Lccm_dec_loop_1x:
	cmp		w4, #16
	blt		.Lccm_dec_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc_le128(v8)

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v8.16b

	st1		{v8.16b}, [x1], #16

	cbz		w4, .Lccm_dec_end
	b		.Lccm_dec_loop_1x

.Lccm_dec_tail:
	/* partial final block: v8 = E(ctr) keystream, RMAC = E(mac).
	 * Full E(mac) is stored first; the byte loop overwrites the first
	 * w4 MAC bytes with E(mac) ^ pt — the MAC update for the
	 * zero-padded final block of recovered plaintext. */
	/* construct CTRs */
	inc_le128(v8)

	SM4_CRYPT_BLK2(RMAC, v8)

	/* store new MAC */
	st1		{RMAC.16b}, [x5]

.Lccm_dec_tail_loop:
	ldrb		w0, [x2], #1		/* get 1 byte from input */
	umov		w9, v8.b[0]		/* get top crypted CTR byte */
	umov		w6, RMAC.b[0]		/* get top MAC byte */

	eor		w9, w9, w0		/* w9 = CTR ^ input */
	eor		w6, w6, w9		/* w6 = MAC ^ output */

	strb		w9, [x1], #1		/* store out byte */
	strb		w6, [x5], #1		/* store MAC byte */

	subs		w4, w4, #1
	beq		.Lccm_dec_ret

	/* shift out one byte */
	ext		RMAC.16b, RMAC.16b, RMAC.16b, #1
	ext		v8.16b, v8.16b, v8.16b, #1

	b		.Lccm_dec_tail_loop

.Lccm_dec_end:
	/* store new MAC */
	st1		{RMAC.16b}, [x5]

	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lccm_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_ccm_dec)
330