/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

/*
 * Map every ".Lv<N>.4s" operand spelling to its register number N, so that
 * the sm4e/sm4ekey macros below can splice the number into the instruction
 * word. All 32 vector registers are listed so that either macro accepts any
 * of them; an operand outside this list would leave an undefined symbol.
 */
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
		16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, \
		30, 31
	.set .Lv\b\().4s, \b
.endr

/*
 * sm4e vd.4s, vn.4s
 *
 * Emits the instruction as a raw word: base 0xcec08400 with the number of
 * vn in bits [9:5] and the number of vd in bits [4:0]. Using .inst
 * presumably lets the file build with assemblers that lack the SM4
 * mnemonics.
 */
.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

/*
 * sm4ekey vd.4s, vn.4s, vm.4s
 *
 * Raw encoding: base 0xce60c800 with vm in bits [20:16], vn in bits [9:5]
 * and vd in bits [4:0].
 */
.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

/* Scratch vectors (used by the XTS tweak computation in this file). */
#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

/*
 * RIV (chaining value / IV) and RMAC (running MAC) deliberately share v20:
 * no function in this file uses both. RMASK holds the XTS reduction mask.
 */
#define RIV	v20
#define RMAC	v20
#define RMASK	v21


.align 3
/*
 * sm4_ce_expand_key - expand a 128-bit key into encrypt and decrypt
 * round-key arrays (8 vectors = 128 bytes each).
 *
 * The decrypt array is the encrypt array in reverse word order: the eight
 * vectors are written last-to-first and the four 32-bit words inside each
 * vector are reversed with .Lbswap128_mask.
 *
 * Uses v0-v7, v16-v31 and x5; x1, x2 and x4 are advanced by 64.
 */
SYM_FUNC_START(sm4_ce_expand_key)
	/* input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc
	 *   x2: rkey_dec
	 *   x3: fk array
	 *   x4: ck array
	 */
	ld1		{v0.16b}, [x0]
	/* byte-swap each 32-bit word of the key */
	rev32		v0.16b, v0.16b
	ld1		{v1.16b}, [x3]
	/* load ck */
	ld1		{v24.16b-v27.16b}, [x4], #64
	ld1		{v28.16b-v31.16b}, [x4]

	/* input ^ fk */
	eor		v0.16b, v0.16b, v1.16b

	/* each step derives the next round-key vector from the previous one */
	sm4ekey		v0.4s, v0.4s, v24.4s
	sm4ekey		v1.4s, v0.4s, v25.4s
	sm4ekey		v2.4s, v1.4s, v26.4s
	sm4ekey		v3.4s, v2.4s, v27.4s
	sm4ekey		v4.4s, v3.4s, v28.4s
	sm4ekey		v5.4s, v4.4s, v29.4s
	sm4ekey		v6.4s, v5.4s, v30.4s
	sm4ekey		v7.4s, v6.4s, v31.4s

	/* ck is no longer needed: v24 is reused for the word-reversal mask */
	adr_l		x5, .Lbswap128_mask
	ld1		{v24.16b}, [x5]

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1]

	/* decrypt keys: vectors in reverse order, words reversed in each */
	tbl		v16.16b, {v7.16b}, v24.16b
	tbl		v17.16b, {v6.16b}, v24.16b
	tbl		v18.16b, {v5.16b}, v24.16b
	tbl		v19.16b, {v4.16b}, v24.16b
	tbl		v20.16b, {v3.16b}, v24.16b
	tbl		v21.16b, {v2.16b}, v24.16b
	tbl		v22.16b, {v1.16b}, v24.16b
	tbl		v23.16b, {v0.16b}, v24.16b

	st1		{v16.16b-v19.16b}, [x2], #64
	st1		{v20.16b-v23.16b}, [x2]

	ret
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
/*
 * sm4_ce_crypt_block - process exactly one 16-byte block from x2 to x1.
 *
 * SM4_PREPARE and SM4_CRYPT_BLK come from sm4-ce-asm.h (not visible here);
 * presumably SM4_PREPARE loads the round keys at x0 and SM4_CRYPT_BLK runs
 * the cipher on one vector in place - confirm against that header. Whether
 * this encrypts or decrypts depends on which round-key array is passed.
 */
SYM_FUNC_START(sm4_ce_crypt_block)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 */
	SM4_PREPARE(x0)

	ld1		{v0.16b}, [x2]
	SM4_CRYPT_BLK(v0)
	st1		{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
/*
 * sm4_ce_crypt - process w3 independent 16-byte blocks from x2 to x1
 * (ECB-style: no chaining between blocks).
 *
 * Blocks are consumed 8 at a time, then at most one group of 4, then
 * singly. x1 and x2 are advanced past the processed data. A block count
 * of zero returns immediately.
 */
SYM_FUNC_START(sm4_ce_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */

	/*
	 * Nothing to do for an empty request. Without this check the 1x tail
	 * loop would decrement w3 from 0 to 0xffffffff and keep going.
	 */
	cbz		w3, .Lcrypt_end

	SM4_PREPARE(x0)

.Lcrypt_loop_blk:
	/* w3 -= 8; a negative result means fewer than 8 blocks remain */
	sub		w3, w3, #8
	tbnz		w3, #31, .Lcrypt_tail8

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w3, .Lcrypt_end
	b		.Lcrypt_loop_blk

.Lcrypt_tail8:
	/* undo the speculative subtraction: w3 = remaining blocks (0..7) */
	add		w3, w3, #8
	cmp		w3, #4
	blt		.Lcrypt_tail4

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64
	SM4_CRYPT_BLK4(v0, v1, v2, v3)
	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w3, .Lcrypt_end

.Lcrypt_tail4:
	/* w3 is 1..3 on entry to this loop */
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16
	SM4_CRYPT_BLK(v0)
	st1		{v0.16b}, [x1], #16

	cbnz		w3, .Lcrypt_tail4

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_ce_crypt)

.align 3
/*
 * sm4_ce_cbc_enc - CBC-encrypt w4 blocks from x2 to x1.
 *
 * Each block is XORed with the previous ciphertext block (the IV for the
 * first one) before being encrypted, so blocks are processed strictly one
 * after another; the 4x path only batches the loads and stores. The last
 * ciphertext block is written back to [x3] as the next IV. A block count
 * of zero returns immediately and leaves [x3] untouched.
 */
SYM_FUNC_START(sm4_ce_cbc_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */

	/* empty request: the 1x loop below would otherwise wrap w4 */
	cbz		w4, .Lcbc_enc_ret

	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcbc_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcbc_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* chain: each ciphertext block feeds the XOR of the next plaintext */
	eor		v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor		v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor		v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor		v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_enc_end
	b		.Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
	/* w4 is 1..3 on entry to this loop */
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	/* RIV doubles as the output block here */
	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

.Lcbc_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
/*
 * sm4_ce_cbc_dec - CBC-decrypt w4 blocks from x2 to x1.
 *
 * The ciphertext stays in v0-v7 because each block is also the chaining
 * value for the next one; byte-swapped copies in v8-v15 are what gets
 * decrypted. The _BE macro variants (sm4-ce-asm.h, not visible here)
 * presumably skip the input byte swap, which is why rev32 is applied by
 * hand - confirm against that header. The last ciphertext block is written
 * back to [x3] as the next IV. A block count of zero returns immediately
 * and leaves [x3] untouched.
 */
SYM_FUNC_START(sm4_ce_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */

	/* empty request: the 1x loop below would otherwise wrap w4 */
	cbz		w4, .Lcbc_dec_ret

	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	/* w4 -= 8; a negative result means fewer than 8 blocks remain */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b
	rev32		v12.16b, v4.16b
	rev32		v13.16b, v5.16b
	rev32		v14.16b, v6.16b
	rev32		v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	/* P[i] = D(C[i]) ^ C[i-1], with the IV standing in for C[-1] */
	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b
	eor		v12.16b, v12.16b, v3.16b
	eor		v13.16b, v13.16b, v4.16b
	eor		v14.16b, v14.16b, v5.16b
	eor		v15.16b, v15.16b, v6.16b

	st1		{v8.16b-v11.16b}, [x1], #64
	st1		{v12.16b-v15.16b}, [x1], #64

	/* last ciphertext block chains into the next batch */
	mov		RIV.16b, v7.16b

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	/* undo the speculative subtraction: w4 = remaining blocks (0..7) */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	/* w4 is 1..3 on entry to this loop */
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	rev32		v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor		v8.16b, v8.16b, RIV.16b
	st1		{v8.16b}, [x1], #16

	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

.Lcbc_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
/*
 * sm4_ce_cbc_cts_enc - encrypt the final two blocks of a CBC message with
 * ciphertext stealing.
 *
 * Ln = nbytes - 16 is the length of the last (possibly partial) block.
 * The code reads 16 bytes at src and 16 bytes at src + Ln, and writes 16
 * bytes at dst and 16 bytes at dst + Ln, so it presumably expects
 * 16 < nbytes <= 32 - TODO confirm against the caller. The IV at [x3] is
 * read but not written back.
 *
 * Permute vectors taken from .Lcts_permute_table:
 *   v3 = table[Ln .. Ln+15]:       moves source bytes 0..Ln-1 to the last
 *                                  Ln positions, other positions out of range
 *   v4 = table[32-Ln .. 32-Ln+15]: moves the last Ln source bytes to the
 *                                  first Ln positions, others out of range
 */
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

	/* x5 = Ln, zero-extended so it can be used in address arithmetic */
	sub		w5, w4, #16
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	/* En-1 = E(Pn-1 ^ IV), kept in RIV */
	ld1		{v0.16b}, [x2]
	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl		v1.16b, {v1.16b}, v4.16b

	eor		v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

	/*
	 * overlapping stores: the store at dst + Ln goes first, so the bytes
	 * it puts below dst + 16 are overwritten by the full block at dst
	 */
	add		x5, x1, x5
	st1		{v0.16b}, [x5]
	st1		{v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

.align 3
/*
 * sm4_ce_cbc_cts_dec - decrypt the final two blocks of a CBC message that
 * was encrypted with ciphertext stealing.
 *
 * Ln = nbytes - 16 is the length of the last (possibly partial) block.
 * The code reads 16 bytes at src and at src + Ln and writes 16 bytes at dst
 * and at dst + Ln, so it presumably expects 16 < nbytes <= 32 - TODO
 * confirm against the caller. The IV at [x3] is read but not written back.
 *
 * v3/v4 are taken from .Lcts_permute_table at offsets Ln and 32 - Ln:
 * v3 moves source bytes 0..Ln-1 to the last Ln positions, v4 moves the
 * last Ln source bytes to the first Ln positions; every other index is
 * out of range (tbl yields 0 there, tbx keeps the destination byte).
 */
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

	/* x5 = Ln, zero-extended so it can be used in address arithmetic */
	sub		w5, w4, #16
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads: v0 = first block, v1 = last 16 input bytes */
	ld1		{v0.16b}, [x2], x5
	ld1		{v1.16b}, [x2]

	SM4_CRYPT_BLK(v0)
	/* select the first Ln bytes of Xn to create Pn */
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx		v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, RIV.16b

	/*
	 * overlapping stores: only the last Ln bytes of v2 are meaningful;
	 * the bytes it puts below dst + 16 are overwritten by the store at dst
	 */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

.align 3
/*
 * sm4_ce_cfb_enc - CFB-encrypt w4 blocks from x2 to x1.
 *
 * C[i] = P[i] ^ E(C[i-1]), with the IV standing in for C[-1]. Every block
 * depends on the previous ciphertext, so the 4x path only batches the
 * loads and stores. The last ciphertext block is written back to [x3] as
 * the next IV. A block count of zero returns immediately and leaves [x3]
 * untouched.
 */
SYM_FUNC_START(sm4_ce_cfb_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */

	/* empty request: the 1x loop below would otherwise wrap w4 */
	cbz		w4, .Lcfb_enc_ret

	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcfb_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcfb_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* v8 = keystream block: encrypt a byte-swapped copy of the feedback */
	rev32		v8.16b, RIV.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v0.16b, v0.16b, v8.16b

	rev32		v8.16b, v0.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v1.16b, v1.16b, v8.16b

	rev32		v8.16b, v1.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v2.16b, v2.16b, v8.16b

	rev32		v8.16b, v2.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v3.16b, v3.16b, v8.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b

	cbz		w4, .Lcfb_enc_end
	b		.Lcfb_enc_loop_4x

.Lcfb_enc_loop_1x:
	/* w4 is 1..3 on entry to this loop */
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	/* RIV becomes the ciphertext block and the next feedback value */
	SM4_CRYPT_BLK(RIV)
	eor		RIV.16b, RIV.16b, v0.16b

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcfb_enc_loop_1x

.Lcfb_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

.Lcfb_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_cfb_enc)

.align 3
/*
 * sm4_ce_cfb_dec - CFB-decrypt w4 blocks from x2 to x1.
 *
 * P[i] = C[i] ^ E(C[i-1]), with the IV standing in for C[-1]. All feedback
 * inputs are ciphertext that is already available, so up to eight
 * keystream blocks are computed in one batch. The last ciphertext block is
 * written back to [x3] as the next IV. A block count of zero returns
 * immediately and leaves [x3] untouched.
 */
SYM_FUNC_START(sm4_ce_cfb_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */

	/* empty request: the 1x loop below would otherwise wrap w4 */
	cbz		w4, .Lcfb_dec_ret

	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcfb_dec_loop_8x:
	/* w4 -= 8; a negative result means fewer than 8 blocks remain */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcfb_dec_4x

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	/* cipher inputs: IV, C0 .. C6 (byte-swapped for the _BE macro) */
	rev32		v8.16b, RIV.16b
	rev32		v9.16b, v0.16b
	rev32		v10.16b, v1.16b
	rev32		v11.16b, v2.16b
	rev32		v12.16b, v3.16b
	rev32		v13.16b, v4.16b
	rev32		v14.16b, v5.16b
	rev32		v15.16b, v6.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	/* save C7 as the next feedback value before v7 is overwritten */
	mov		RIV.16b, v7.16b

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcfb_dec_end
	b		.Lcfb_dec_loop_8x

.Lcfb_dec_4x:
	/* undo the speculative subtraction: w4 = remaining blocks (0..7) */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcfb_dec_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, RIV.16b
	rev32		v9.16b, v0.16b
	rev32		v10.16b, v1.16b
	rev32		v11.16b, v2.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	/* save C3 as the next feedback value before v3 is overwritten */
	mov		RIV.16b, v3.16b

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lcfb_dec_end

.Lcfb_dec_loop_1x:
	/* w4 is 1..3 on entry to this loop */
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RIV)

	eor		RIV.16b, RIV.16b, v0.16b
	st1		{RIV.16b}, [x1], #16

	/* the ciphertext just consumed is the next feedback value */
	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcfb_dec_loop_1x

.Lcfb_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

.Lcfb_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_cfb_dec)

/*
 * inc_le128(vctr) - materialise the current counter in vctr, then add 1.
 *
 * The counter lives in x7 (upper 64 bits) and x8 (lower 64 bits) after the
 * byte reversal done at function entry. The macro copies both halves into
 * vctr, byte-reverses each 64-bit lane with rev64 to restore the in-memory
 * byte order, and increments the x7:x8 pair. rev64 sits between adds and
 * adc; it does not touch the flags, so the carry still propagates.
 */
#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;

.align 3
/*
 * sm4_ce_ctr_enc - CTR-mode encrypt/decrypt w4 blocks from x2 to x1.
 *
 * Each output block is the input block XORed with the encrypted counter;
 * the counter is incremented once per block and the updated value is
 * written back to [x3]. A block count of zero returns immediately and
 * leaves [x3] untouched.
 */
SYM_FUNC_START(sm4_ce_ctr_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */

	/* empty request: the 1x loop below would otherwise wrap w4 */
	cbz		w4, .Lctr_ret

	SM4_PREPARE(x0)

	/* x7:x8 = counter, byte-reversed so it can be incremented with adds/adc */
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_loop_8x:
	/* w4 -= 8; a negative result means fewer than 8 blocks remain */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_4x

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	ld1		{v8.16b-v11.16b}, [x2], #64
	ld1		{v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* keystream ^ input */
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_end
	b		.Lctr_loop_8x

.Lctr_4x:
	/* undo the speculative subtraction: w4 = remaining blocks (0..7) */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_loop_1x

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_end

.Lctr_loop_1x:
	/* w4 is 1..3 on entry to this loop */
	sub		w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1		{v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lctr_ret:
	ret
SYM_FUNC_END(sm4_ce_ctr_enc)


/*
 * tweak_next(vt, vin, RTMP) - vt = vin multiplied by x in GF(2^128).
 *
 * vin is treated as two 64-bit lanes. Each lane is doubled with add, and
 * the bit shifted out of each lane is recovered as follows:
 *   - sshr #63 turns each lane into all-ones if its top bit was set;
 *   - the and with RMASK (lane 0 = 1, lane 1 = 0x87, as set up by the XTS
 *     functions) reduces that to the value to fold in;
 *   - ext #8 swaps the two lanes, so the carry out of the low lane (1)
 *     lands in the high lane and the carry out of the high lane (0x87)
 *     lands in the low lane;
 *   - eor applies both.
 * RTMP is clobbered. vt may be the same register as vin.
 */
#define tweak_next(vt, vin, RTMP)					\
		sshr		RTMP.2d, vin.2d, #63;			\
		and		RTMP.16b, RTMP.16b, RMASK.16b;		\
		add		vt.2d, vin.2d, vin.2d;			\
		ext		RTMP.16b, RTMP.16b, RTMP.16b, #8;	\
		eor		vt.16b, vt.16b, RTMP.16b;

.align 3
/*
 * sm4_ce_xts_enc - XTS-encrypt w4 bytes from x2 to x1.
 *
 * If x5 is non-NULL the tweak loaded from [x3] is first encrypted with the
 * round keys at x5; otherwise it is used as loaded. Each block is XORed
 * with its tweak before and after the cipher, and tweak_next advances the
 * tweak between blocks.
 *
 * When nbytes is not a multiple of 16, the last full block is held back
 * and handled together with the partial tail by ciphertext stealing. In
 * that case the tweak is NOT written back to [x3]; otherwise the next
 * tweak is stored there.
 *
 * NOTE(review): for 0 < nbytes < 16 the block count computed below becomes
 * 0xffffffff; callers presumably never pass such a length - confirm.
 */
SYM_FUNC_START(sm4_ce_xts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

	/*
	 * x5 = tail length (nbytes % 16)
	 * w4 = number of blocks for the bulk loops: nbytes / 16, minus one
	 *      when there is a tail (that block is left for the CTS path)
	 */
	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

	/* RMASK = { 1, 0, 0x87, 0 } as 32-bit lanes, i.e. d[0] = 1, d[1] = 0x87 */
	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	/* w4 -= 8; a negative result means fewer than 8 blocks remain */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_enc_4x

	/* v8 .. v15 = tweaks for the eight blocks */
	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	/* v8 = tweak for the block after this batch */
	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_enc_cts
	b		.Lxts_enc_loop_8x

.Lxts_enc_4x:
	/* undo the speculative subtraction: w4 = remaining blocks (0..7) */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_enc_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	/* no tail: just store the tweak and return */
	cbz		x5, .Lxts_enc_end

	/* cipher text stealing */

	/* v8 encrypts the last full block, v9 the recombined final block */
	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/*
	 * load permute table: v3 from offset tail, v4 from offset 32 - tail.
	 * v3 moves source bytes 0..tail-1 to the last tail positions; v4
	 * moves the last tail source bytes to the first tail positions.
	 */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b


	/*
	 * overlapping stores: the bytes of v2 that land below dst + 16 are
	 * overwritten by the full block stored at dst
	 */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_enc_ret

.Lxts_enc_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_enc)

.align 3
/*
 * sm4_ce_xts_dec - XTS-decrypt w4 bytes from x2 to x1.
 *
 * If x5 is non-NULL the tweak loaded from [x3] is first run through the
 * cipher with the round keys at x5; otherwise it is used as loaded. Each
 * block is XORed with its tweak before and after the cipher, and
 * tweak_next advances the tweak between blocks.
 *
 * When nbytes is not a multiple of 16, the last full block is held back
 * and handled together with the partial tail by ciphertext stealing. Note
 * the tweak order there is the reverse of the encrypt path: the last full
 * ciphertext block uses the *later* tweak (v9) and the recombined block
 * uses the earlier one (v8). In the stealing case the tweak is NOT written
 * back to [x3]; otherwise the next tweak is stored there.
 *
 * NOTE(review): for 0 < nbytes < 16 the block count computed below becomes
 * 0xffffffff; callers presumably never pass such a length - confirm.
 */
SYM_FUNC_START(sm4_ce_xts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	/*
	 * x5 = tail length (nbytes % 16)
	 * w4 = number of blocks for the bulk loops: nbytes / 16, minus one
	 *      when there is a tail (that block is left for the CTS path)
	 */
	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

	/* RMASK = { 1, 0, 0x87, 0 } as 32-bit lanes, i.e. d[0] = 1, d[1] = 0x87 */
	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	/* w4 -= 8; a negative result means fewer than 8 blocks remain */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_dec_4x

	/* v8 .. v15 = tweaks for the eight blocks */
	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	/* v8 = tweak for the block after this batch */
	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_dec_cts
	b		.Lxts_dec_loop_8x

.Lxts_dec_4x:
	/* undo the speculative subtraction: w4 = remaining blocks (0..7) */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_dec_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	/* no tail: just store the tweak and return */
	cbz		x5, .Lxts_dec_end

	/* cipher text stealing */

	/* the last full ciphertext block is decrypted with the later tweak */
	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/*
	 * load permute table: v3 from offset tail, v4 from offset 32 - tail.
	 * v3 moves source bytes 0..tail-1 to the last tail positions; v4
	 * moves the last tail source bytes to the first tail positions.
	 */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* the first tail bytes of the decrypted block are the partial Pn */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* replace them with the partial Cn; the rest is the stolen tail */
	tbx		v0.16b, {v1.16b}, v4.16b

	/* the recombined block is decrypted with the earlier tweak */
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b


	/*
	 * overlapping stores: the bytes of v2 that land below dst + 16 are
	 * overwritten by the full block stored at dst
	 */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_dec_ret

.Lxts_dec_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
/*
 * sm4_ce_mac_update - fold w3 blocks from x2 into the 16-byte MAC state
 * at [x1] (CBC-MAC style: state = E(state ^ block)).
 *
 *   enc_before != 0: the state is encrypted once before any block is
 *                    absorbed.
 *   enc_after  != 0: every block is absorbed and encrypted.
 *   enc_after  == 0: the last block is only XORed into the state; its
 *                    encryption is left to a later call.
 *
 * With nblocks == 0 only the optional enc_before step is performed. The
 * updated state is always written back to [x1].
 */
SYM_FUNC_START(sm4_ce_mac_update)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: digest
	 *   x2: src
	 *   w3: nblocks
	 *   w4: enc_before
	 *   w5: enc_after
	 */
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

	cbz		w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	cbz		w3, .Lmac_ret

	/* w3 = blocks to absorb AND encrypt: nblocks, or nblocks - 1 if !enc_after */
	sub		w6, w3, #1
	cmp		w5, wzr
	csel		w3, w3, w6, ne

	cbz		w3, .Lmac_end

.Lmac_loop_4x:
	cmp		w3, #4
	blt		.Lmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz		w3, .Lmac_end
	b		.Lmac_loop_4x

.Lmac_loop_1x:
	/* w3 is 1..3 on entry to this loop */
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz		w3, .Lmac_loop_1x


.Lmac_end:
	cbnz		w5, .Lmac_ret

	/* !enc_after: absorb the held-back last block without encrypting */
	ld1		{v0.16b}, [x2], #16
	eor		RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1		{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)


1081	.section	".rodata", "a"
1082	.align 4
1083.Lbswap128_mask:
1084	.byte		0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
1085	.byte		0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03
1086
1087.Lcts_permute_table:
1088	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1089	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1090	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
1091	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
1092	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1093	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1094