/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-CCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in rfc8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */
967fa3a7fSTianjia Zhang
1067fa3a7fSTianjia Zhang#include <linux/linkage.h>
11*736f8868STianjia Zhang#include <linux/cfi_types.h>
1267fa3a7fSTianjia Zhang#include <asm/assembler.h>
1367fa3a7fSTianjia Zhang#include "sm4-ce-asm.h"
1467fa3a7fSTianjia Zhang
/* Select the armv8-a architecture with the crypto extension enabled. */
.arch	armv8-a+crypto

/*
 * For every vector register number N in the list, define an assembler
 * symbol ".LvN.4s" whose value is N.  The sm4e macro below prefixes its
 * operands with ".L", so an operand written as "v8.4s" resolves to 8.
 * Only the register numbers listed here can be used as sm4e operands.
 */
.irp b, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

/*
 * sm4e vd, vn
 *
 * Emit the raw instruction word 0xcec08400 with the register number of
 * \vn inserted at bit 5 and the register number of \vd at bit 0.
 * NOTE(review): presumably hand-encoded so the file assembles with
 * toolchains that lack the SM4 mnemonics -- confirm.
 */
.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

/* Vector register holding the running CBC-MAC value in every routine below. */
#define RMAC	v16

/* Helper macros. */

/*
 * inc_le128(vctr)
 *
 * Build one counter block in vctr and advance the counter:
 *   - x7 (upper 64 bits) goes to vctr.d[0], x8 (lower 64 bits) to
 *     vctr.d[1], and rev64 then byte-reverses each 64-bit half;
 *   - x7:x8 is incremented by one, with the carry out of x8 added to x7.
 * vctr receives the value from before the increment.  The adds instruction
 * overwrites the condition flags.
 */
#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;
3767fa3a7fSTianjia Zhang
3867fa3a7fSTianjia Zhang
.align 3
SYM_FUNC_START(sm4_ce_cbcmac_update)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: mac
	 *   x2: src
	 *   w3: nblocks
	 *
	 * Folds nblocks 16-byte blocks from src into the MAC at [x1].
	 * For every block the running MAC is first passed through
	 * SM4_CRYPT_BLK (from sm4-ce-asm.h) and then XORed with the block,
	 * so the value stored back has the last block XORed in but not yet
	 * encrypted.
	 *
	 * nblocks is treated as unsigned; nblocks == 0 returns at once
	 * without reading or writing memory.
	 */

	/*
	 * Nothing to absorb.  Without this check the single-block loop
	 * would decrement w3 from 0 to 0xffffffff and run far past src.
	 */
	cbz		w3, .Lcbcmac_ret

	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

.Lcbcmac_loop_4x:
	/* fewer than four blocks left: finish one block at a time */
	cmp		w3, #4
	b.lo		.Lcbcmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* CBC-MAC is a serial chain: crypt, absorb, crypt, absorb, ... */
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b

	cbz		w3, .Lcbcmac_end
	b		.Lcbcmac_loop_4x

.Lcbcmac_loop_1x:
	/* w3 is 1..3 on entry here */
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v0.16b

	cbnz		w3, .Lcbcmac_loop_1x

.Lcbcmac_end:
	/* write the updated MAC back */
	st1		{RMAC.16b}, [x1]

.Lcbcmac_ret:
	ret
SYM_FUNC_END(sm4_ce_cbcmac_update)
8567fa3a7fSTianjia Zhang
.align 3
SYM_FUNC_START(sm4_ce_ccm_final)
	/*
	 * Produce the final CCM tag in place.
	 *
	 * Arguments:
	 *   x0 - round key array (CTX)
	 *   x1 - ctr0, the 128-bit big-endian initial counter block
	 *   x2 - mac, read on entry and overwritten with the result
	 */
	SM4_PREPARE(x0)

	/* fetch the counter block and the running MAC */
	ld1		{v0.16b}, [x1]
	ld1		{RMAC.16b}, [x2]

	/* run both blocks through the cipher in one pass */
	SM4_CRYPT_BLK2(RMAC, v0)

	/* mask the MAC with the crypted ctr0 and write it back */
	eor		RMAC.16b, RMAC.16b, v0.16b
	st1		{RMAC.16b}, [x2]

	ret
SYM_FUNC_END(sm4_ce_ccm_final)
10667fa3a7fSTianjia Zhang
.align 3
SYM_TYPED_FUNC_START(sm4_ce_ccm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: mac
	 *
	 * CTR-encrypts nbytes from src to dst and folds the plaintext
	 * (the input) into the MAC at [x5].
	 *
	 * Whole 16-byte blocks are processed four at a time, then one at a
	 * time.  If nbytes is a multiple of 16 the advanced counter is
	 * written back to [x3].  A trailing partial block (1..15 bytes) is
	 * handled byte by byte; on that path the counter at [x3] is NOT
	 * written back.
	 *
	 * nbytes is treated as unsigned; nbytes == 0 returns at once
	 * without reading or writing memory.
	 *
	 * Scratch: x7:x8 hold the counter in native byte order (x7 is the
	 * upper half), w0/w6/w9 are used by the byte-wise tail.
	 */

	/*
	 * Empty input.  Without this check control would reach the
	 * byte-wise tail, where "subs w4, w4, #1" wraps from 0 to
	 * 0xffffffff and the loop runs far past src, dst and mac.
	 */
	cbz		w4, .Lccm_enc_ret

	SM4_PREPARE(x0)

	/* load the big-endian counter and convert to native order */
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

	ld1		{RMAC.16b}, [x5]

.Lccm_enc_loop_4x:
	/* unsigned compare: nbytes is a length, never negative */
	cmp		w4, #(4 * 16)
	b.lo		.Lccm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc_le128(v8)			/* +0 */
	inc_le128(v9)			/* +1 */
	inc_le128(v10)			/* +2 */
	inc_le128(v11)			/* +3 */

	ld1		{v0.16b-v3.16b}, [x2], #64

	/*
	 * Each step crypts one counter block together with the running
	 * MAC, then XORs the plaintext block into both: the counter
	 * register becomes ciphertext, RMAC absorbs the plaintext.
	 */
	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK2(v9, RMAC)
	eor		v9.16b, v9.16b, v1.16b
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK2(v10, RMAC)
	eor		v10.16b, v10.16b, v2.16b
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK2(v11, RMAC)
	eor		v11.16b, v11.16b, v3.16b
	eor		RMAC.16b, RMAC.16b, v3.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	cbz		w4, .Lccm_enc_end
	b		.Lccm_enc_loop_4x

.Lccm_enc_loop_1x:
	/* w4 is 1..63 here; below 16 only a partial block is left */
	cmp		w4, #16
	b.lo		.Lccm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc_le128(v8)

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v0.16b

	st1		{v8.16b}, [x1], #16

	cbz		w4, .Lccm_enc_end
	b		.Lccm_enc_loop_1x

.Lccm_enc_tail:
	/* w4 is 1..15 here */

	/* construct CTRs */
	inc_le128(v8)

	SM4_CRYPT_BLK2(RMAC, v8)

	/*
	 * store new MAC; the loop below then overwrites its leading
	 * nbytes bytes with MAC ^ input, the rest stays as stored here
	 */
	st1		{RMAC.16b}, [x5]

.Lccm_enc_tail_loop:
	ldrb		w0, [x2], #1		/* get 1 byte from input */
	umov		w9, v8.b[0]		/* get top crypted CTR byte */
	umov		w6, RMAC.b[0]		/* get top MAC byte */

	eor		w9, w9, w0		/* w9 = CTR ^ input */
	eor		w6, w6, w0		/* w6 = MAC ^ input */

	strb		w9, [x1], #1		/* store out byte */
	strb		w6, [x5], #1		/* store MAC byte */

	subs		w4, w4, #1
	beq		.Lccm_enc_ret

	/* shift out one byte */
	ext		RMAC.16b, RMAC.16b, RMAC.16b, #1
	ext		v8.16b, v8.16b, v8.16b, #1

	b		.Lccm_enc_tail_loop

.Lccm_enc_end:
	/* store new MAC */
	st1		{RMAC.16b}, [x5]

	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lccm_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_ccm_enc)
21867fa3a7fSTianjia Zhang
.align 3
SYM_TYPED_FUNC_START(sm4_ce_ccm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: mac
	 *
	 * CTR-decrypts nbytes from src to dst and folds the recovered
	 * plaintext (the output) into the MAC at [x5].
	 *
	 * Whole 16-byte blocks are processed four at a time, then one at a
	 * time.  If nbytes is a multiple of 16 the advanced counter is
	 * written back to [x3].  A trailing partial block (1..15 bytes) is
	 * handled byte by byte; on that path the counter at [x3] is NOT
	 * written back.
	 *
	 * nbytes is treated as unsigned; nbytes == 0 returns at once
	 * without reading or writing memory.
	 *
	 * Scratch: x7:x8 hold the counter in native byte order (x7 is the
	 * upper half), w0/w6/w9 are used by the byte-wise tail.
	 */

	/*
	 * Empty input.  Without this check control would reach the
	 * byte-wise tail, where "subs w4, w4, #1" wraps from 0 to
	 * 0xffffffff and the loop runs far past src, dst and mac.
	 */
	cbz		w4, .Lccm_dec_ret

	SM4_PREPARE(x0)

	/* load the big-endian counter and convert to native order */
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

	ld1		{RMAC.16b}, [x5]

.Lccm_dec_loop_4x:
	/* unsigned compare: nbytes is a length, never negative */
	cmp		w4, #(4 * 16)
	b.lo		.Lccm_dec_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc_le128(v8)			/* +0 */
	inc_le128(v9)			/* +1 */
	inc_le128(v10)			/* +2 */
	inc_le128(v11)			/* +3 */

	ld1		{v0.16b-v3.16b}, [x2], #64

	/*
	 * Each step crypts one counter block together with the running
	 * MAC, XORs the ciphertext block into the counter register to
	 * recover the plaintext, and then absorbs that plaintext into
	 * RMAC (unlike encryption, which absorbs the input).
	 */
	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v8.16b
	SM4_CRYPT_BLK2(v9, RMAC)
	eor		v9.16b, v9.16b, v1.16b
	eor		RMAC.16b, RMAC.16b, v9.16b
	SM4_CRYPT_BLK2(v10, RMAC)
	eor		v10.16b, v10.16b, v2.16b
	eor		RMAC.16b, RMAC.16b, v10.16b
	SM4_CRYPT_BLK2(v11, RMAC)
	eor		v11.16b, v11.16b, v3.16b
	eor		RMAC.16b, RMAC.16b, v11.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	cbz		w4, .Lccm_dec_end
	b		.Lccm_dec_loop_4x

.Lccm_dec_loop_1x:
	/* w4 is 1..63 here; below 16 only a partial block is left */
	cmp		w4, #16
	b.lo		.Lccm_dec_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc_le128(v8)

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v8.16b

	st1		{v8.16b}, [x1], #16

	cbz		w4, .Lccm_dec_end
	b		.Lccm_dec_loop_1x

.Lccm_dec_tail:
	/* w4 is 1..15 here */

	/* construct CTRs */
	inc_le128(v8)

	SM4_CRYPT_BLK2(RMAC, v8)

	/*
	 * store new MAC; the loop below then overwrites its leading
	 * nbytes bytes with MAC ^ output, the rest stays as stored here
	 */
	st1		{RMAC.16b}, [x5]

.Lccm_dec_tail_loop:
	ldrb		w0, [x2], #1		/* get 1 byte from input */
	umov		w9, v8.b[0]		/* get top crypted CTR byte */
	umov		w6, RMAC.b[0]		/* get top MAC byte */

	eor		w9, w9, w0		/* w9 = CTR ^ input */
	eor		w6, w6, w9		/* w6 = MAC ^ output */

	strb		w9, [x1], #1		/* store out byte */
	strb		w6, [x5], #1		/* store MAC byte */

	subs		w4, w4, #1
	beq		.Lccm_dec_ret

	/* shift out one byte */
	ext		RMAC.16b, RMAC.16b, RMAC.16b, #1
	ext		v8.16b, v8.16b, v8.16b, #1

	b		.Lccm_dec_tail_loop

.Lccm_dec_end:
	/* store new MAC */
	st1		{RMAC.16b}, [x5]

	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lccm_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_ccm_dec)
330