xref: /openbmc/linux/arch/arm64/crypto/sm4-neon-core.S (revision 72ed5d5624af384eaf74d84915810d54486a75e2)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
/*
 * Register macros
 *
 *   v0  - v7  : data blocks, passed explicitly to the helper macros
 *   v8  - v15 : scratch registers (RTMP0 - RTMP7)
 *   v16 - v31 : S-box table, loaded by SM4_PREPARE()
 *
 * RX0, RX1, RKEY and RIV are aliases of RTMP4 - RTMP7 (v12 - v15).
 * Any macro or code sequence that writes RTMP4 - RTMP7 therefore
 * destroys the round-key word vector (RKEY) and the chaining value (RIV).
 *
 * NOTE(review): v8 - v15 are callee-saved (low 64 bits) under AAPCS64 and
 * nothing in this file saves them; presumably every caller brackets the
 * call with kernel_neon_begin()/kernel_neon_end() - confirm.
 */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11

#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

/* Aliases of RTMP4 - RTMP7, see the note above. */
#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15
30
/* Helper macros. */

/*
 * SM4_PREPARE - load the 256-byte table at crypto_sm4_sbox into v16 - v31.
 *
 * The table is read in four 64-byte chunks so that each group of four
 * consecutive vector registers can be used as one tbl/tbx lookup table.
 *
 * Clobbers: x5, v16 - v31.
 */
#define SM4_PREPARE()                                           \
	adr_l		x5, crypto_sm4_sbox;                    \
	ld1		{v16.16b-v19.16b}, [x5], #64;           \
	ld1		{v20.16b-v23.16b}, [x5], #64;           \
	ld1		{v24.16b-v27.16b}, [x5], #64;           \
	ld1		{v28.16b-v31.16b}, [x5];
39
/*
 * transpose_4x4 - in-place transpose of a 4x4 matrix of 32-bit words.
 *
 * s0 - s3 are the four rows on entry.  On exit s<i> holds word <i> of
 * each original row, i.e. s<i>.s[j] = old s<j>.s[i].
 *
 * Clobbers: RTMP0 - RTMP3.
 */
#define transpose_4x4(s0, s1, s2, s3)                           \
	zip1		RTMP0.4s, s0.4s, s1.4s;                 \
	zip1		RTMP1.4s, s2.4s, s3.4s;                 \
	zip2		RTMP2.4s, s0.4s, s1.4s;                 \
	zip2		RTMP3.4s, s2.4s, s3.4s;                 \
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;              \
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;              \
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;              \
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;
49
/*
 * transpose_4x4_2x - two independent 4x4 word transposes, interleaved.
 *
 * Transposes (s0, s1, s2, s3) and (s4, s5, s6, s7) in place, each exactly
 * as a single 4x4 word transpose would.
 *
 * Clobbers: RTMP0 - RTMP7.  RTMP4 - RTMP7 are the same registers as
 * RX0, RX1, RKEY and RIV, so those values do not survive this macro.
 */
#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)        \
	zip1		RTMP0.4s, s0.4s, s1.4s;                 \
	zip1		RTMP1.4s, s2.4s, s3.4s;                 \
	zip2		RTMP2.4s, s0.4s, s1.4s;                 \
	zip2		RTMP3.4s, s2.4s, s3.4s;                 \
	zip1		RTMP4.4s, s4.4s, s5.4s;                 \
	zip1		RTMP5.4s, s6.4s, s7.4s;                 \
	zip2		RTMP6.4s, s4.4s, s5.4s;                 \
	zip2		RTMP7.4s, s6.4s, s7.4s;                 \
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;              \
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;              \
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;              \
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;              \
	zip1		s4.2d, RTMP4.2d, RTMP5.2d;              \
	zip2		s5.2d, RTMP4.2d, RTMP5.2d;              \
	zip1		s6.2d, RTMP6.2d, RTMP7.2d;              \
	zip2		s7.2d, RTMP6.2d, RTMP7.2d;
67
/*
 * rotate_clockwise_4x4 - rotate a 4x4 matrix of 32-bit words by 90 degrees.
 *
 * s0 - s3 are the four rows on entry.  On exit row <i> is the old column
 * <i> read from the bottom row upwards:
 *
 *	s<i> = { old s3.s[i], old s2.s[i], old s1.s[i], old s0.s[i] }
 *
 * This is a transpose combined with a reversal of the word order, which
 * turns the word-sliced round state back into whole blocks with their
 * four words in reversed order.
 *
 * Clobbers: RTMP0 - RTMP3.
 */
#define rotate_clockwise_4x4(s0, s1, s2, s3)                    \
	zip1		RTMP0.4s, s1.4s, s0.4s;                 \
	zip2		RTMP1.4s, s1.4s, s0.4s;                 \
	zip1		RTMP2.4s, s3.4s, s2.4s;                 \
	zip2		RTMP3.4s, s3.4s, s2.4s;                 \
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;              \
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;              \
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;              \
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;
77
/*
 * rotate_clockwise_4x4_2x - two independent 90-degree rotations of 4x4
 * word matrices, interleaved.
 *
 * For each of the groups (s0..s3) and (s4..s7), output row <i> is the old
 * column <i> of that group read from its last row up to its first row.
 *
 * Clobbers: RTMP0 - RTMP7.  RTMP4 - RTMP7 are the same registers as
 * RX0, RX1, RKEY and RIV, so those values do not survive this macro.
 */
#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
	zip1		RTMP0.4s, s1.4s, s0.4s;                 \
	zip1		RTMP2.4s, s3.4s, s2.4s;                 \
	zip2		RTMP1.4s, s1.4s, s0.4s;                 \
	zip2		RTMP3.4s, s3.4s, s2.4s;                 \
	zip1		RTMP4.4s, s5.4s, s4.4s;                 \
	zip1		RTMP6.4s, s7.4s, s6.4s;                 \
	zip2		RTMP5.4s, s5.4s, s4.4s;                 \
	zip2		RTMP7.4s, s7.4s, s6.4s;                 \
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;              \
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;              \
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;              \
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;              \
	zip1		s4.2d, RTMP6.2d, RTMP4.2d;              \
	zip2		s5.2d, RTMP6.2d, RTMP4.2d;              \
	zip1		s6.2d, RTMP7.2d, RTMP5.2d;              \
	zip2		s7.2d, RTMP7.2d, RTMP5.2d;
95
/*
 * ROUND4 - one SM4 round on four word-sliced blocks.
 *
 *	s0 ^= L(sbox(rk ^ s1 ^ s2 ^ s3))
 *	L(x) = x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24)
 *
 * round:  index (0 - 3) of the round-key word inside RKEY
 * s0..s3: state vectors; each 32-bit lane belongs to a different block
 *
 * The 256-entry S-box lives in v16 - v31 as four 64-byte tables.  tbl
 * yields 0 for indices >= 64 and tbx leaves the destination byte untouched
 * for them, so subtracting 64 from the indices between lookups walks all
 * four tables while each byte is filled in by exactly one of them.
 *
 * All operations are lane-wise, so the content of one 32-bit lane never
 * influences another lane.
 *
 * Clobbers: RX0, RTMP0 - RTMP3.
 */
#define ROUND4(round, s0, s1, s2, s3)                           \
	dup		RX0.4s, RKEY.s[round];                  \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	eor		RTMP1.16b, s2.16b, s3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP1.16b;            \
                                                                \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
                                                                \
	/* linear part: shl + sri together form a rotate left */ \
	shl		RTMP1.4s, RTMP0.4s, #8;                 \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP0.4s, #24;                \
	sri		RTMP1.4s, RTMP0.4s, #(32-8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32-16);           \
	sri		RTMP3.4s, RTMP0.4s, #(32-24);           \
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */            \
	eor		RTMP1.16b, RTMP1.16b, RTMP0.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP2.16b;        \
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */        \
	eor		RTMP3.16b, RTMP3.16b, RTMP0.16b;        \
	shl		RTMP2.4s, RTMP1.4s, #2;                 \
	sri		RTMP2.4s, RTMP1.4s, #(32-2);            \
	eor		RTMP3.16b, RTMP3.16b, RTMP2.16b;        \
	/* s0 ^= RTMP3 */                                       \
	eor		s0.16b, s0.16b, RTMP3.16b;
130
/*
 * SM4_CRYPT_BLK4_BE - run all 32 SM4 rounds on four word-sliced blocks
 * whose words have already been byte-swapped by the caller.
 *
 * b0..b3: on entry, b<i> holds word <i> of each of the four blocks (one
 *         block per 32-bit lane), already passed through rev32.
 *         On exit, b<i> holds output block <i> in memory byte order.
 * x0:     pointer to the 32 round keys (128 bytes).  It is advanced by
 *         16 bytes per loop iteration and rewound before the macro ends,
 *         so it is unchanged on exit.
 *
 * The loop executes 8 times; every iteration loads four round keys into
 * RKEY and performs four rounds, rotating the roles of b0..b3.
 *
 * Uses numeric local label 4.
 * Clobbers: x6, condition flags, RKEY, RX0, RTMP0 - RTMP3.
 */
#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)                       \
	mov		x6, #8;                                 \
4:                                                              \
	ld1		{RKEY.4s}, [x0], #16;                   \
	subs		x6, x6, #1;                             \
                                                                \
	ROUND4(0, b0, b1, b2, b3);                              \
	ROUND4(1, b1, b2, b3, b0);                              \
	ROUND4(2, b2, b3, b0, b1);                              \
	ROUND4(3, b3, b0, b1, b2);                              \
                                                                \
	/* flags are still those of the subs above */           \
	bne		4b;                                     \
                                                                \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
                                                                \
	rotate_clockwise_4x4(b0, b1, b2, b3);                   \
                                                                \
	/* repoint to rkey */                                   \
	sub		x0, x0, #128;
153
/*
 * SM4_CRYPT_BLK4 - byte-swap every 32-bit word of b0..b3, then process
 * the four word-sliced blocks with SM4_CRYPT_BLK4_BE.
 *
 * Same register contract as SM4_CRYPT_BLK4_BE, except that the input is
 * taken in memory byte order.
 */
#define SM4_CRYPT_BLK4(b0, b1, b2, b3)                          \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);
160
/*
 * ROUND8 - one SM4 round on two groups of four word-sliced blocks.
 *
 *	s0 ^= L(sbox(rk ^ s1 ^ s2 ^ s3))
 *	t0 ^= L(sbox(rk ^ t1 ^ t2 ^ t3))
 *	L(x) = x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24)
 *
 * round:  index (0 - 3) of the round-key word inside RKEY
 * s0..s3: state vectors of the first group
 * t0..t3: state vectors of the second group
 *
 * The two groups are processed with interleaved instructions.  The S-box
 * lookup walks the four 64-byte tables in v16 - v31: tbl yields 0 and tbx
 * keeps the destination byte for indices >= 64, and the indices are
 * reduced by 64 between lookups.
 *
 * Clobbers: RX0, RX1, RTMP0 - RTMP3.  RKEY and RIV are left intact.
 */
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)           \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	dup		RX0.4s, RKEY.s[round];                  \
	eor		RTMP0.16b, s2.16b, s3.16b;              \
	mov		RX1.16b, RX0.16b;                       \
	eor		RTMP1.16b, t2.16b, t3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX1.16b, RX1.16b, t1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
                                                                \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	tbl		RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;  \
                                                                \
	/* linear part: shl + sri together form a rotate left */ \
	shl		RX0.4s, RTMP0.4s, #8;                   \
	shl		RX1.4s, RTMP1.4s, #8;                   \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP1.4s, #16;                \
	sri		RX0.4s, RTMP0.4s, #(32 - 8);            \
	sri		RX1.4s, RTMP1.4s, #(32 - 8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 16);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 16);         \
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
	eor		RX0.16b, RX0.16b, RTMP2.16b;            \
	eor		RX1.16b, RX1.16b, RTMP3.16b;            \
	/* RTMP0/1 = x ^ rol32(x, 24) ^ rol32(RX, 2) */         \
	shl		RTMP2.4s, RTMP0.4s, #24;                \
	shl		RTMP3.4s, RTMP1.4s, #24;                \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 24);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 24);         \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	shl		RTMP2.4s, RX0.4s, #2;                   \
	shl		RTMP3.4s, RX1.4s, #2;                   \
	sri		RTMP2.4s, RX0.4s, #(32 - 2);            \
	sri		RTMP3.4s, RX1.4s, #(32 - 2);            \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	/* s0/t0 ^= RTMP0/1 */                                  \
	eor		s0.16b, s0.16b, RTMP0.16b;              \
	eor		t0.16b, t0.16b, RTMP1.16b;
219
/*
 * SM4_CRYPT_BLK8_norotate - run all 32 SM4 rounds on two groups of four
 * word-sliced blocks, without the final rotation back to block layout.
 *
 * b0..b3: first group, b<i> holds word <i> of each of four blocks
 * b4..b7: second group, same layout
 * x0:     pointer to the 32 round keys (128 bytes); unchanged on exit
 *
 * Input words are taken in memory byte order (rev32 is applied first) and
 * the result words are swapped back at the end.  The caller still has to
 * rotate each group (rotate_clockwise_4x4 or rotate_clockwise_4x4_2x) to
 * obtain whole output blocks.
 *
 * Uses numeric local label 8.
 * Clobbers: x6, condition flags, RKEY, RX0, RX1, RTMP0 - RTMP3.
 * RIV is left intact.
 */
#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	rev32		b4.16b, b4.16b;                         \
	rev32		b5.16b, b5.16b;                         \
	rev32		b6.16b, b6.16b;                         \
	rev32		b7.16b, b7.16b;                         \
                                                                \
	mov		x6, #8;                                 \
8:                                                              \
	ld1		{RKEY.4s}, [x0], #16;                   \
	subs		x6, x6, #1;                             \
                                                                \
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);              \
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);              \
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);              \
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);              \
                                                                \
	/* flags are still those of the subs above */           \
	bne		8b;                                     \
                                                                \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	rev32		b4.16b, b4.16b;                         \
	rev32		b5.16b, b5.16b;                         \
	rev32		b6.16b, b6.16b;                         \
	rev32		b7.16b, b7.16b;                         \
                                                                \
	/* repoint to rkey */                                   \
	sub		x0, x0, #128;
253
/*
 * SM4_CRYPT_BLK8 - process eight word-sliced blocks and rotate both
 * groups of four back into whole blocks.
 *
 * Clobbers everything SM4_CRYPT_BLK8_norotate does, plus RTMP4 - RTMP7
 * (and therefore RIV) through rotate_clockwise_4x4_2x.
 *
 * The last line deliberately has no trailing backslash, so the macro body
 * cannot absorb whatever follows the definition.
 */
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)			\
	SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7);	\
	rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);

258
.align 3
SYM_FUNC_START(sm4_neon_crypt)
	/*
	 * Process nblocks 16-byte blocks from src to dst with the given
	 * round keys (ECB style: blocks are independent of each other).
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 *
	 * Blocks are handled 8 at a time, then 4 at a time, then a tail of
	 * 1 - 3 blocks.  x1 and x2 are advanced past the processed data.
	 *
	 * Clobbers: x5, x6, condition flags, v0 - v31.
	 */

	/*
	 * Nothing to do for an empty request.  Without this check the tail
	 * path below would still read and write one whole block.
	 */
	cbz		w3, .Lcrypt_end

	SM4_PREPARE()

.Lcrypt_loop_8x:
	sub		w3, w3, #8
	/* fewer than 8 blocks left: the subtraction went negative */
	tbnz		w3, #31, .Lcrypt_4x

	/* ld4 de-interleaves: v<i> / v<4+i> receive word i of each block */
	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w3, .Lcrypt_end
	b		.Lcrypt_loop_8x

.Lcrypt_4x:
	/* undo the speculative subtraction: w3 = remaining blocks (0 - 7) */
	add		w3, w3, #8
	cmp		w3, #4
	blt		.Lcrypt_tail

	sub		w3, w3, #4

	ld4		{v0.4s-v3.4s}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w3, .Lcrypt_end

.Lcrypt_tail:
	/*
	 * 1 - 3 blocks left.  ld1/st1 do not touch the flags, so one cmp
	 * steers both conditional branches that follow it.
	 */
	cmp		w3, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcrypt_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcrypt_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
	/*
	 * Registers that were not loaded hold stale data.  After the
	 * transpose every block occupies its own 32-bit lane and the round
	 * function is lane-wise, so the stale lanes cannot affect the valid
	 * blocks; their results are simply never stored.
	 */
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	/* the round loop clobbered the flags, so compare again */
	cmp		w3, #2
	st1		{v0.16b}, [x1], #16
	blt		.Lcrypt_end
	st1		{v1.16b}, [x1], #16
	beq		.Lcrypt_end
	st1		{v2.16b}, [x1], #16

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_neon_crypt)
322
.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
	/*
	 * CBC decryption: dst[i] = crypt(src[i]) ^ src[i - 1], with the IV
	 * standing in for src[-1].  The last ciphertext block processed is
	 * written back to *iv.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * Clobbers: x5, x6, condition flags, v0 - v31.
	 */

	/*
	 * Nothing to do for an empty request.  Without this check the tail
	 * path below would process one block and replace the IV.
	 */
	cbz		w4, .Lcbc_dec_ret

	SM4_PREPARE()

	ld1		{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	/* fewer than 8 blocks left: the subtraction went negative */
	tbnz		w4, #31, .Lcbc_dec_4x

	/* second load has no post-increment: x2 stays 64 bytes into the chunk */
	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2]

	SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)

	/*
	 * Avoid overwriting the RIV register: the 2x rotate macro uses
	 * RTMP4 - RTMP7, and RTMP7 is RIV.  Two single rotates only need
	 * RTMP0 - RTMP3.
	 */
	rotate_clockwise_4x4(v0, v1, v2, v3)
	rotate_clockwise_4x4(v4, v5, v6, v7)

	/* rewind to the start of this 8-block chunk */
	sub		x2, x2, #64

	/*
	 * Consume RIV before the ciphertext is reloaded below, because the
	 * reload into RTMP7 overwrites it.
	 */
	eor		v0.16b, v0.16b, RIV.16b

	/* reload the raw ciphertext blocks as chaining values */
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v1.16b, v1.16b, RTMP0.16b
	eor		v2.16b, v2.16b, RTMP1.16b
	eor		v3.16b, v3.16b, RTMP2.16b
	eor		v4.16b, v4.16b, RTMP3.16b
	eor		v5.16b, v5.16b, RTMP4.16b
	eor		v6.16b, v6.16b, RTMP5.16b
	eor		v7.16b, v7.16b, RTMP6.16b

	/* last ciphertext block becomes the next IV (same register as RTMP7) */
	mov		RIV.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	/* undo the speculative subtraction: w4 = remaining blocks (0 - 7) */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_tail

	sub		w4, w4, #4

	/* keep the raw ciphertext in v0 - v3 for the chaining XOR */
	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b
	rev32		v7.16b, v3.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	eor		v4.16b, v4.16b, RIV.16b
	eor		v5.16b, v5.16b, v0.16b
	eor		v6.16b, v6.16b, v1.16b
	eor		v7.16b, v7.16b, v2.16b

	mov		RIV.16b, v3.16b

	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_tail:
	/* 1 - 3 blocks left; ld1 does not touch the flags */
	cmp		w4, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcbc_dec_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcbc_dec_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
	/*
	 * v7, and v5/v6 when fewer than 3 blocks were loaded, carry stale
	 * data.  Each block ends up in its own 32-bit lane after the
	 * transpose and the rounds are lane-wise, so the valid blocks are
	 * unaffected.
	 */
	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	/*
	 * The round loop clobbered the flags, so compare again.  eor, mov
	 * and st1 leave the flags alone, so this single cmp steers both
	 * conditional branches below.
	 */
	cmp		w4, #2
	eor		v4.16b, v4.16b, RIV.16b
	mov		RIV.16b, v0.16b
	st1		{v4.16b}, [x1], #16
	blt		.Lcbc_dec_end

	eor		v5.16b, v5.16b, v0.16b
	mov		RIV.16b, v1.16b
	st1		{v5.16b}, [x1], #16
	beq		.Lcbc_dec_end

	eor		v6.16b, v6.16b, v1.16b
	mov		RIV.16b, v2.16b
	st1		{v6.16b}, [x1], #16

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

.Lcbc_dec_ret:
	ret
SYM_FUNC_END(sm4_neon_cbc_dec)
439
.align 3
SYM_FUNC_START(sm4_neon_cfb_dec)
	/*
	 * CFB decryption: dst[i] = crypt(src[i - 1]) ^ src[i], with the IV
	 * standing in for src[-1].  The last ciphertext block processed is
	 * written back to *iv.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * v0 carries the current chaining block between iterations.
	 *
	 * Clobbers: x5, x6, condition flags, v0 - v31.
	 */

	/*
	 * Nothing to do for an empty request.  Without this check the tail
	 * path below would process one block and replace the IV.
	 */
	cbz		w4, .Lcfb_dec_ret

	SM4_PREPARE()

	ld1		{v0.16b}, [x3]

.Lcfb_dec_loop_8x:
	sub		w4, w4, #8
	/* fewer than 8 blocks left: the subtraction went negative */
	tbnz		w4, #31, .Lcfb_dec_4x

	/*
	 * Cipher inputs are IV, c0 .. c6.  c0 - c2 join the IV in v0 - v3
	 * (transposed below); c3 - c6 are loaded already word-sliced by
	 * ld4, which leaves x2 48 bytes into the chunk.
	 */
	ld1		{v1.16b-v3.16b}, [x2], #48
	ld4		{v4.4s-v7.4s}, [x2]

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* rewind and reload the raw ciphertext c0 .. c7 */
	sub		x2, x2, #48
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	eor		v4.16b, v4.16b, RTMP4.16b
	eor		v5.16b, v5.16b, RTMP5.16b
	eor		v6.16b, v6.16b, RTMP6.16b
	eor		v7.16b, v7.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	/* last ciphertext block is the next chaining value */
	mov		v0.16b, RTMP7.16b

	cbz		w4, .Lcfb_dec_end
	b		.Lcfb_dec_loop_8x

.Lcfb_dec_4x:
	/* undo the speculative subtraction: w4 = remaining blocks (0 - 7) */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcfb_dec_tail

	sub		w4, w4, #4

	/* keep the raw ciphertext in v4 - v7 for the final XOR */
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v0.16b, v0.16b		/* v0 is IV register */
	rev32		v1.16b, v4.16b
	rev32		v2.16b, v5.16b
	rev32		v3.16b, v6.16b

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	mov		v0.16b, v7.16b

	cbz		w4, .Lcfb_dec_end

.Lcfb_dec_tail:
	/* 1 - 3 blocks left; ld1 does not touch the flags */
	cmp		w4, #2
	ld1		{v4.16b}, [x2], #16
	blt		.Lcfb_dec_tail_load_done
	ld1		{v5.16b}, [x2], #16
	beq		.Lcfb_dec_tail_load_done
	ld1		{v6.16b}, [x2], #16

.Lcfb_dec_tail_load_done:
	/*
	 * v3, and v1/v2 when fewer blocks were loaded, are derived from
	 * stale data.  Each block ends up in its own 32-bit lane after the
	 * transpose and the rounds are lane-wise, so the valid blocks are
	 * unaffected.
	 */
	rev32		v0.16b, v0.16b		/* v0 is IV register */
	rev32		v1.16b, v4.16b
	rev32		v2.16b, v5.16b

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)

	/*
	 * The round loop clobbered the flags, so compare again.  eor, st1
	 * and mov leave the flags alone, so this single cmp steers both
	 * conditional branches below.
	 */
	cmp		w4, #2
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x1], #16
	mov		v0.16b, v4.16b
	blt		.Lcfb_dec_end

	eor		v1.16b, v1.16b, v5.16b
	st1		{v1.16b}, [x1], #16
	mov		v0.16b, v5.16b
	beq		.Lcfb_dec_end

	eor		v2.16b, v2.16b, v6.16b
	st1		{v2.16b}, [x1], #16
	mov		v0.16b, v6.16b

.Lcfb_dec_end:
	/* store new IV */
	st1		{v0.16b}, [x3]

.Lcfb_dec_ret:
	ret
SYM_FUNC_END(sm4_neon_cfb_dec)
552
/*
 * inc_le128 - emit the current 128-bit counter into vctr, then add one.
 *
 * The counter is kept as two native 64-bit integers: x7 holds the upper
 * half and x8 the lower half.  vctr receives the pre-increment value in
 * big-endian byte order (rev64 byte-reverses each 64-bit lane), and the
 * pair x7:x8 is then incremented with carry.  rev64 sits between adds and
 * adc but does not modify the flags.
 *
 * Clobbers: condition flags.
 */
#define inc_le128(vctr)                             \
		mov		vctr.d[1], x8;      \
		mov		vctr.d[0], x7;      \
		adds		x8, x8, #1;         \
		rev64		vctr.16b, vctr.16b; \
		adc		x7, x7, xzr;

.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
	/*
	 * CTR mode: dst[i] = crypt(ctr + i) ^ src[i].  The counter advanced
	 * past the last block is written back to *ctr.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * Clobbers: x5 - x8, condition flags, v0 - v31.
	 */

	/*
	 * Nothing to do for an empty request.  Without this check the tail
	 * path below would process one block and advance the counter.
	 */
	cbz		w4, .Lctr_crypt_ret

	SM4_PREPARE()

	/* x7 = upper 64 bits, x8 = lower 64 bits, as native integers */
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_crypt_loop_8x:
	sub		w4, w4, #8
	/* fewer than 8 blocks left: the subtraction went negative */
	tbnz		w4, #31, .Lctr_crypt_4x

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	eor		v4.16b, v4.16b, RTMP4.16b
	eor		v5.16b, v5.16b, RTMP5.16b
	eor		v6.16b, v6.16b, RTMP6.16b
	eor		v7.16b, v7.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end
	b		.Lctr_crypt_loop_8x

.Lctr_crypt_4x:
	/* undo the speculative subtraction: w4 = remaining blocks (0 - 7) */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_crypt_tail

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v4.16b-v7.16b}, [x2], #64

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end

.Lctr_crypt_tail:
	/*
	 * 1 - 3 blocks left.  inc_le128 clobbers the condition flags, so
	 * every cmp has to come after it, right before the branch it
	 * steers.  The counter is advanced only for blocks actually used.
	 */
	ld1		{v4.16b}, [x2], #16
	inc_le128(v0)
	cmp		w4, #2
	blt		.Lctr_crypt_tail_load_done

	ld1		{v5.16b}, [x2], #16
	inc_le128(v1)
	cmp		w4, #2
	beq		.Lctr_crypt_tail_load_done

	ld1		{v6.16b}, [x2], #16
	inc_le128(v2)

.Lctr_crypt_tail_load_done:
	/*
	 * v3, and v1/v2 when fewer counters were built, hold stale data.
	 * Each block ends up in its own 32-bit lane after the transpose and
	 * the rounds are lane-wise, so the valid blocks are unaffected.
	 */
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	/*
	 * The round loop clobbered the flags, so compare again.  eor and
	 * st1 leave the flags alone, so this single cmp steers both
	 * conditional branches below.
	 */
	cmp		w4, #2

	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x1], #16
	blt		.Lctr_crypt_end

	eor		v1.16b, v1.16b, v5.16b
	st1		{v1.16b}, [x1], #16
	beq		.Lctr_crypt_end

	eor		v2.16b, v2.16b, v6.16b
	st1		{v2.16b}, [x1], #16

.Lctr_crypt_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lctr_crypt_ret:
	ret
SYM_FUNC_END(sm4_neon_ctr_crypt)
680