xref: /openbmc/linux/arch/arm64/crypto/sm4-ce-core.S (revision acf50233)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
.arch	armv8-a+crypto

/*
 * The sm4e and sm4ekey macros below build their instruction words with
 * .inst, so they need the register number of every operand.  For each
 * vector register listed here an absolute symbol named .Lv<N>.4s with the
 * value <N> is defined; an operand written as "v<N>.4s" is then turned
 * into its number simply by prefixing it with ".L".
 */

/* v0 - v7 */
.irp b, 0, 1, 2, 3, 4, 5, 6, 7
	.equ .Lv\b\().4s, \b
.endr

/* v16 and v20 */
.irp b, 16, 20
	.equ .Lv\b\().4s, \b
.endr

/* v24 - v31 */
.irp b, 24, 25, 26, 27, 28, 29, 30, 31
	.equ .Lv\b\().4s, \b
.endr

/*
 * sm4e vd, vn
 *
 * Emits 0xcec08400 OR-ed with the number of vd and with the number of vn
 * shifted left by 5.  Both operands must be written as "v<N>.4s" and must
 * name a register covered by the symbol table above.
 */
.macro sm4e, vd, vn
	.inst 0xcec08400 | .L\vd | (.L\vn << 5)
.endm

/*
 * sm4ekey vd, vn, vm
 *
 * Emits 0xce60c800 OR-ed with the number of vd, the number of vn shifted
 * left by 5 and the number of vm shifted left by 16.  Operand syntax is
 * the same as for sm4e.
 */
.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | .L\vd | (.L\vn << 5) | (.L\vm << 16)
.endm
27
/* Register macros */

/* Four consecutive vector registers (v16-v19) used as temporaries. */
#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

/*
 * Vector register that carries the IV in sm4_ce_cbc_enc, sm4_ce_cbc_dec
 * and sm4_ce_cfb_enc.
 */
#define RIV	v20
36
/* Helper macros. */

/*
 * PREPARE: load the 128 bytes that x0 points at into v24-v31, sixteen
 * bytes per register.  The first load post-increments x0 by 64; the second
 * one leaves it there, so x0 ends up 64 bytes past its initial value.
 */
#define PREPARE                                                       \
	ld1		{v24.16b, v25.16b, v26.16b, v27.16b}, [x0], #64; \
	ld1		{v28.16b, v29.16b, v30.16b, v31.16b}, [x0];
42
/* One sm4e step on a single block, using the round-key vector rk. */
#define SM4_CE_SM4E_1X(rk, blk)                     \
	sm4e		blk.4s, rk.4s;

/* Reverse the order of the four 32-bit words of a single block. */
#define SM4_CE_WORD_REVERSE_1X(blk)                 \
	rev64		blk.4s, blk.4s;             \
	ext		blk.16b, blk.16b, blk.16b, #8;

/*
 * SM4_CRYPT_BLK(b0): process one 16-byte block in place in vector
 * register b0, using the round keys that PREPARE loaded into v24-v31.
 *
 * The bytes of every 32-bit word are swapped on the way in and on the
 * way out; in between, one sm4e step is applied per round-key vector
 * (v24 first, v31 last) and the word order of the result is reversed.
 */
#define SM4_CRYPT_BLK(b0)                           \
	rev32		b0.16b, b0.16b;             \
	SM4_CE_SM4E_1X(v24, b0)                     \
	SM4_CE_SM4E_1X(v25, b0)                     \
	SM4_CE_SM4E_1X(v26, b0)                     \
	SM4_CE_SM4E_1X(v27, b0)                     \
	SM4_CE_SM4E_1X(v28, b0)                     \
	SM4_CE_SM4E_1X(v29, b0)                     \
	SM4_CE_SM4E_1X(v30, b0)                     \
	SM4_CE_SM4E_1X(v31, b0)                     \
	SM4_CE_WORD_REVERSE_1X(b0)                  \
	rev32		b0.16b, b0.16b;
56
/* Swap the bytes of every 32-bit word in each of four blocks. */
#define SM4_CE_REV32_4X(b0, b1, b2, b3)             \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;

/* One sm4e step with round-key vector rk on each of four blocks. */
#define SM4_CE_SM4E_4X(rk, b0, b1, b2, b3)          \
	sm4e		b0.4s, rk.4s;               \
	sm4e		b1.4s, rk.4s;               \
	sm4e		b2.4s, rk.4s;               \
	sm4e		b3.4s, rk.4s;

/* Reverse the order of the four 32-bit words in each of four blocks. */
#define SM4_CE_WORD_REVERSE_4X(b0, b1, b2, b3)      \
	rev64		b0.4s, b0.4s;               \
	rev64		b1.4s, b1.4s;               \
	rev64		b2.4s, b2.4s;               \
	rev64		b3.4s, b3.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	ext		b1.16b, b1.16b, b1.16b, #8; \
	ext		b2.16b, b2.16b, b2.16b, #8; \
	ext		b3.16b, b3.16b, b3.16b, #8;

/*
 * SM4_CRYPT_BLK4(b0, b1, b2, b3): four-block variant of the single-block
 * transform.  Each stage is issued for all four registers before the next
 * stage starts, so the four independent blocks are interleaved.  Round
 * keys are taken from v24-v31 (see PREPARE); the blocks are transformed
 * in place.
 */
#define SM4_CRYPT_BLK4(b0, b1, b2, b3)              \
	SM4_CE_REV32_4X(b0, b1, b2, b3)             \
	SM4_CE_SM4E_4X(v24, b0, b1, b2, b3)         \
	SM4_CE_SM4E_4X(v25, b0, b1, b2, b3)         \
	SM4_CE_SM4E_4X(v26, b0, b1, b2, b3)         \
	SM4_CE_SM4E_4X(v27, b0, b1, b2, b3)         \
	SM4_CE_SM4E_4X(v28, b0, b1, b2, b3)         \
	SM4_CE_SM4E_4X(v29, b0, b1, b2, b3)         \
	SM4_CE_SM4E_4X(v30, b0, b1, b2, b3)         \
	SM4_CE_SM4E_4X(v31, b0, b1, b2, b3)         \
	SM4_CE_WORD_REVERSE_4X(b0, b1, b2, b3)      \
	SM4_CE_REV32_4X(b0, b1, b2, b3)
106
/* Swap the bytes of every 32-bit word in each of eight blocks. */
#define SM4_CE_REV32_8X(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;             \
	rev32		b4.16b, b4.16b;             \
	rev32		b5.16b, b5.16b;             \
	rev32		b6.16b, b6.16b;             \
	rev32		b7.16b, b7.16b;

/* One sm4e step with round-key vector rk on each of eight blocks. */
#define SM4_CE_SM4E_8X(rk, b0, b1, b2, b3, b4, b5, b6, b7) \
	sm4e		b0.4s, rk.4s;               \
	sm4e		b1.4s, rk.4s;               \
	sm4e		b2.4s, rk.4s;               \
	sm4e		b3.4s, rk.4s;               \
	sm4e		b4.4s, rk.4s;               \
	sm4e		b5.4s, rk.4s;               \
	sm4e		b6.4s, rk.4s;               \
	sm4e		b7.4s, rk.4s;

/* Reverse the order of the four 32-bit words in each of eight blocks. */
#define SM4_CE_WORD_REVERSE_8X(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev64		b0.4s, b0.4s;               \
	rev64		b1.4s, b1.4s;               \
	rev64		b2.4s, b2.4s;               \
	rev64		b3.4s, b3.4s;               \
	rev64		b4.4s, b4.4s;               \
	rev64		b5.4s, b5.4s;               \
	rev64		b6.4s, b6.4s;               \
	rev64		b7.4s, b7.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	ext		b1.16b, b1.16b, b1.16b, #8; \
	ext		b2.16b, b2.16b, b2.16b, #8; \
	ext		b3.16b, b3.16b, b3.16b, #8; \
	ext		b4.16b, b4.16b, b4.16b, #8; \
	ext		b5.16b, b5.16b, b5.16b, #8; \
	ext		b6.16b, b6.16b, b6.16b, #8; \
	ext		b7.16b, b7.16b, b7.16b, #8;

/*
 * SM4_CRYPT_BLK8(b0, ..., b7): eight-block variant of the single-block
 * transform.  Every stage is issued for all eight registers before the
 * next stage starts, so the eight independent blocks are interleaved.
 * Round keys are taken from v24-v31 (see PREPARE); the blocks are
 * transformed in place.
 */
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
	SM4_CE_REV32_8X(b0, b1, b2, b3, b4, b5, b6, b7)             \
	SM4_CE_SM4E_8X(v24, b0, b1, b2, b3, b4, b5, b6, b7)         \
	SM4_CE_SM4E_8X(v25, b0, b1, b2, b3, b4, b5, b6, b7)         \
	SM4_CE_SM4E_8X(v26, b0, b1, b2, b3, b4, b5, b6, b7)         \
	SM4_CE_SM4E_8X(v27, b0, b1, b2, b3, b4, b5, b6, b7)         \
	SM4_CE_SM4E_8X(v28, b0, b1, b2, b3, b4, b5, b6, b7)         \
	SM4_CE_SM4E_8X(v29, b0, b1, b2, b3, b4, b5, b6, b7)         \
	SM4_CE_SM4E_8X(v30, b0, b1, b2, b3, b4, b5, b6, b7)         \
	SM4_CE_SM4E_8X(v31, b0, b1, b2, b3, b4, b5, b6, b7)         \
	SM4_CE_WORD_REVERSE_8X(b0, b1, b2, b3, b4, b5, b6, b7)      \
	SM4_CE_REV32_8X(b0, b1, b2, b3, b4, b5, b6, b7)
204
205
.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/*
	 * Expand a 128-bit key into the encryption and decryption round-key
	 * arrays.
	 *
	 * input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc (128 bytes written)
	 *   x2: rkey_dec (128 bytes written)
	 *   x3: fk array
	 *   x4: ck array
	 *
	 * x1, x2 and x4 are advanced while storing/loading; v0-v7 and
	 * v24-v31 are overwritten.
	 */

	/* v1 = fk, v24-v31 = ck */
	ld1		{v1.16b}, [x3]
	ld1		{v24.16b-v27.16b}, [x4], #64
	ld1		{v28.16b-v31.16b}, [x4]

	/* v0 = key with the bytes of each 32-bit word swapped, XOR fk */
	ld1		{v0.16b}, [x0]
	rev32		v0.16b, v0.16b
	eor		v0.16b, v0.16b, v1.16b

	/* each step derives the next four round keys from the previous four */
	sm4ekey		v0.4s, v0.4s, v24.4s
	sm4ekey		v1.4s, v0.4s, v25.4s
	sm4ekey		v2.4s, v1.4s, v26.4s
	sm4ekey		v3.4s, v2.4s, v27.4s
	sm4ekey		v4.4s, v3.4s, v28.4s
	sm4ekey		v5.4s, v4.4s, v29.4s
	sm4ekey		v6.4s, v5.4s, v30.4s
	sm4ekey		v7.4s, v6.4s, v31.4s

	/* encryption round keys: v0-v7 in generation order */
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1]

	/*
	 * Decryption round keys: the same 32 words in reverse order.  The
	 * four words inside each register are reversed (rev64 + ext #8)
	 * and the registers are stored last-to-first.
	 */
	rev64		v7.4s, v7.4s
	ext		v7.16b, v7.16b, v7.16b, #8
	st1		{v7.16b}, [x2], #16

	rev64		v6.4s, v6.4s
	ext		v6.16b, v6.16b, v6.16b, #8
	st1		{v6.16b}, [x2], #16

	rev64		v5.4s, v5.4s
	ext		v5.16b, v5.16b, v5.16b, #8
	st1		{v5.16b}, [x2], #16

	rev64		v4.4s, v4.4s
	ext		v4.16b, v4.16b, v4.16b, #8
	st1		{v4.16b}, [x2], #16

	rev64		v3.4s, v3.4s
	ext		v3.16b, v3.16b, v3.16b, #8
	st1		{v3.16b}, [x2], #16

	rev64		v2.4s, v2.4s
	ext		v2.16b, v2.16b, v2.16b, #8
	st1		{v2.16b}, [x2], #16

	rev64		v1.4s, v1.4s
	ext		v1.16b, v1.16b, v1.16b, #8
	st1		{v1.16b}, [x2], #16

	rev64		v0.4s, v0.4s
	ext		v0.16b, v0.16b, v0.16b, #8
	st1		{v0.16b}, [x2]

	ret
SYM_FUNC_END(sm4_ce_expand_key)
263
.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/*
	 * Transform exactly one 16-byte block.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 */

	/* fetch the block, then the round keys (v24-v31) */
	ld1		{v0.16b}, [x2]
	PREPARE

	SM4_CRYPT_BLK(v0)

	st1		{v0.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_crypt_block)
279
.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/*
	 * Transform nblocks consecutive 16-byte blocks from src to dst
	 * with the round keys in CTX, each block independently.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 *
	 * Blocks are handled eight at a time while at least eight remain,
	 * then at most once four at a time, then one at a time.
	 */

	/*
	 * Nothing to do for an empty request.  Without this check a zero
	 * count would reach the single-block loop, wrap to 0xffffffff on
	 * its first decrement and run far past both buffers.
	 */
	cbz		w3, .Lcrypt_end

	PREPARE

.Lcrypt_loop_blk:
	/* w3 -= 8; bit 31 set means fewer than eight blocks were left */
	sub		w3, w3, #8
	tbnz		w3, #31, .Lcrypt_tail8

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w3, .Lcrypt_end
	b		.Lcrypt_loop_blk

.Lcrypt_tail8:
	/* undo the speculative subtraction: w3 = blocks left (1..7) */
	add		w3, w3, #8
	cmp		w3, #4
	blt		.Lcrypt_tail4

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64
	SM4_CRYPT_BLK4(v0, v1, v2, v3)
	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w3, .Lcrypt_end

.Lcrypt_tail4:
	/* w3 is non-zero on entry; one block per iteration */
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16
	SM4_CRYPT_BLK(v0)
	st1		{v0.16b}, [x1], #16

	cbnz		w3, .Lcrypt_tail4

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_ce_crypt)
330
.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/*
	 * CBC encryption: every input block is XOR-ed with the previous
	 * output block (the IV for the first one) before it is encrypted.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * On return the IV buffer holds the last output block.
	 */

	/*
	 * Empty request: leave dst and the IV untouched.  Without this
	 * check the loop counter would wrap to 0xffffffff on its first
	 * decrement.
	 */
	cbz		w4, .Lcbc_enc_ret

	PREPARE

	ld1		{RIV.16b}, [x3]

.Lcbc_enc_loop:
	sub		w4, w4, #1

	/* RIV = next input block ^ previous output block */
	ld1		{RTMP0.16b}, [x2], #16
	eor		RIV.16b, RIV.16b, RTMP0.16b

	SM4_CRYPT_BLK(RIV)

	/* RIV is both the output block and the next chaining value */
	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcbc_enc_loop

	/* store new IV */
	st1		{RIV.16b}, [x3]

.Lcbc_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_cbc_enc)
361
.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/*
	 * CBC decryption: every input block is run through the block
	 * transform and then XOR-ed with the previous input block (the IV
	 * for the first one).
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * On return the IV buffer holds the last input block.  The input
	 * blocks are read a second time after the transform to serve as
	 * chaining values; each reload happens before the corresponding
	 * store to dst.
	 */

	/*
	 * Empty request: leave dst and the IV untouched.  Without this
	 * check a zero count would reach the single-block loop and wrap to
	 * 0xffffffff on its first decrement.
	 */
	cbz		w4, .Lcbc_dec_ret

	PREPARE

	ld1		{RIV.16b}, [x3]

.Lcbc_loop_blk:
	/* w4 -= 8; bit 31 set means fewer than eight blocks were left */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_tail8

	/* v0-v7 = input blocks 0..7; x2 is left pointing at block 4 */
	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2]

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* rewind to block 0 and reload blocks 0..3 as chaining values */
	sub		x2, x2, #64
	eor		v0.16b, v0.16b, RIV.16b
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v1.16b, v1.16b, RTMP0.16b
	eor		v2.16b, v2.16b, RTMP1.16b
	eor		v3.16b, v3.16b, RTMP2.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	/* block 3 chains into output 4; then reload blocks 4..7 */
	eor		v4.16b, v4.16b, RTMP3.16b
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v5.16b, v5.16b, RTMP0.16b
	eor		v6.16b, v6.16b, RTMP1.16b
	eor		v7.16b, v7.16b, RTMP2.16b

	/* input block 7 is the chaining value for whatever follows */
	mov		RIV.16b, RTMP3.16b
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_end
	b		.Lcbc_loop_blk

.Lcbc_tail8:
	/* undo the speculative subtraction: w4 = blocks left (1..7) */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_tail4

	sub		w4, w4, #4

	/* no post-increment: the same four blocks are reloaded below */
	ld1		{v0.16b-v3.16b}, [x2]

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, RIV.16b
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v1.16b, v1.16b, RTMP0.16b
	eor		v2.16b, v2.16b, RTMP1.16b
	eor		v3.16b, v3.16b, RTMP2.16b

	mov		RIV.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lcbc_end

.Lcbc_tail4:
	/* w4 is non-zero on entry; one block per iteration */
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2]

	SM4_CRYPT_BLK(v0)

	/* XOR with the old chaining value, then pick up the new one */
	eor		v0.16b, v0.16b, RIV.16b
	ld1		{RIV.16b}, [x2], #16
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lcbc_tail4

.Lcbc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

.Lcbc_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_cbc_dec)
445
.align 3
SYM_FUNC_START(sm4_ce_cfb_enc)
	/*
	 * CFB encryption: the previous output block (the IV for the first
	 * one) is run through the block transform and XOR-ed with the next
	 * input block.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * On return the IV buffer holds the last output block.
	 */

	/*
	 * Empty request: leave dst and the IV untouched.  Without this
	 * check the loop counter would wrap to 0xffffffff on its first
	 * decrement.
	 */
	cbz		w4, .Lcfb_enc_ret

	PREPARE

	ld1		{RIV.16b}, [x3]

.Lcfb_enc_loop:
	sub		w4, w4, #1

	SM4_CRYPT_BLK(RIV)

	/* RIV = transformed chaining value ^ input block = output block */
	ld1		{RTMP0.16b}, [x2], #16
	eor		RIV.16b, RIV.16b, RTMP0.16b
	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcfb_enc_loop

	/* store new IV */
	st1		{RIV.16b}, [x3]

.Lcfb_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_cfb_enc)
475
.align 3
SYM_FUNC_START(sm4_ce_cfb_dec)
	/*
	 * CFB decryption: output block i is input block i XOR-ed with the
	 * block transform of input block i-1 (of the IV for i == 0).
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * v0 carries the chaining value between iterations.  On return the
	 * IV buffer holds the last input block.
	 */

	/*
	 * Empty request: leave dst and the IV untouched.  Without this
	 * check a zero count would reach the single-block loop and wrap to
	 * 0xffffffff on its first decrement.
	 */
	cbz		w4, .Lcfb_dec_ret

	PREPARE

	ld1		{v0.16b}, [x3]

.Lcfb_loop_blk:
	/* w4 -= 8; bit 31 set means fewer than eight blocks were left */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcfb_tail8

	/* v0 = chaining value, v1-v7 = input blocks 0..6 */
	ld1		{v1.16b, v2.16b, v3.16b}, [x2], #48
	ld1		{v4.16b-v7.16b}, [x2]

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* rewind to block 0 and XOR in input blocks 0..3 */
	sub		x2, x2, #48
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	/* input blocks 4..7 */
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v4.16b, v4.16b, RTMP0.16b
	eor		v5.16b, v5.16b, RTMP1.16b
	eor		v6.16b, v6.16b, RTMP2.16b
	eor		v7.16b, v7.16b, RTMP3.16b
	st1		{v4.16b-v7.16b}, [x1], #64

	/* input block 7 is the next chaining value */
	mov		v0.16b, RTMP3.16b

	cbz		w4, .Lcfb_end
	b		.Lcfb_loop_blk

.Lcfb_tail8:
	/* undo the speculative subtraction: w4 = blocks left (1..7) */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcfb_tail4

	sub		w4, w4, #4

	/* v0 = chaining value, v1-v3 = input blocks 0..2 (no post-increment) */
	ld1		{v1.16b, v2.16b, v3.16b}, [x2]

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	/* input block 3 is the next chaining value */
	mov		v0.16b, RTMP3.16b

	cbz		w4, .Lcfb_end

.Lcfb_tail4:
	/* w4 is non-zero on entry; one block per iteration */
	sub		w4, w4, #1

	SM4_CRYPT_BLK(v0)

	ld1		{RTMP0.16b}, [x2], #16
	eor		v0.16b, v0.16b, RTMP0.16b
	st1		{v0.16b}, [x1], #16

	/* the input block just consumed is the next chaining value */
	mov		v0.16b, RTMP0.16b

	cbnz		w4, .Lcfb_tail4

.Lcfb_end:
	/* store new IV */
	st1		{v0.16b}, [x3]

.Lcfb_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_cfb_dec)
559
/*
 * inc_le128(vctr): materialise the current counter block and step the
 * counter.
 *
 * x7 holds the upper and x8 the lower 64 bits of the 128-bit counter as
 * native integers.  The pair is copied into vctr (x7 -> d[0], x8 -> d[1]),
 * then incremented by one with carry from x8 into x7, and finally the
 * bytes of each 64-bit half of vctr are reversed so that vctr holds the
 * pre-increment counter in big-endian byte order.  Clobbers the condition
 * flags.
 */
#define inc_le128(vctr)                     \
	mov		vctr.d[1], x8;      \
	mov		vctr.d[0], x7;      \
	adds		x8, x8, #1;         \
	adc		x7, x7, xzr;        \
	rev64		vctr.16b, vctr.16b;

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/*
	 * CTR mode: every input block is XOR-ed with the block transform
	 * of the current counter value, and the counter is incremented by
	 * one per block.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * On return the ctr buffer holds the counter for the next block.
	 */

	/*
	 * Empty request: leave dst and the counter untouched.  Without
	 * this check a zero count would reach the single-block loop and
	 * wrap to 0xffffffff on its first decrement.
	 */
	cbz		w4, .Lctr_ret

	PREPARE

	/*
	 * x7:x8 = counter as native integers.
	 * NOTE(review): ldp followed by rev gives the numeric value only
	 * when data loads are little-endian - confirm that big-endian
	 * kernels are not a target for this code.
	 */
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_loop_blk:
	/* w4 -= 8; bit 31 set means fewer than eight blocks were left */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_tail8

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* XOR the key stream in v0-v7 with eight input blocks */
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v4.16b, v4.16b, RTMP0.16b
	eor		v5.16b, v5.16b, RTMP1.16b
	eor		v6.16b, v6.16b, RTMP2.16b
	eor		v7.16b, v7.16b, RTMP3.16b
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_end
	b		.Lctr_loop_blk

.Lctr_tail8:
	/* undo the speculative subtraction: w4 = blocks left (1..7) */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_tail4

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_end

.Lctr_tail4:
	/* w4 is non-zero on entry; one block per iteration */
	sub		w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	SM4_CRYPT_BLK(v0)

	ld1		{RTMP0.16b}, [x2], #16
	eor		v0.16b, v0.16b, RTMP0.16b
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lctr_tail4

.Lctr_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lctr_ret:
	ret
SYM_FUNC_END(sm4_ce_ctr_enc)
661