1a41b2129STianjia Zhang// SPDX-License-Identifier: GPL-2.0-or-later
2a41b2129STianjia Zhang/*
3a41b2129STianjia Zhang * sm3-neon-core.S - SM3 secure hash using NEON instructions
4a41b2129STianjia Zhang *
5a41b2129STianjia Zhang * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64
6a41b2129STianjia Zhang *
7a41b2129STianjia Zhang * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
8a41b2129STianjia Zhang * Copyright (c) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
9a41b2129STianjia Zhang */
10a41b2129STianjia Zhang
11a41b2129STianjia Zhang#include <linux/linkage.h>
12*be8f6b64SEric Biggers#include <linux/cfi_types.h>
13a41b2129STianjia Zhang#include <asm/assembler.h>
14a41b2129STianjia Zhang
15a41b2129STianjia Zhang/* Context structure */
/*
 * Byte offsets 0, 4, ..., 28: one slot per 32-bit chaining word h0..h7.
 * NOTE(review): presumably these mirror the layout of the state that RSTATE
 * points to; the visible part of sm3_neon_transform addresses that state
 * with literal offsets (#0, #8, #16, #24) instead of these names - confirm
 * whether anything still uses them.
 */
16a41b2129STianjia Zhang
17a41b2129STianjia Zhang#define state_h0 0
18a41b2129STianjia Zhang#define state_h1 4
19a41b2129STianjia Zhang#define state_h2 8
20a41b2129STianjia Zhang#define state_h3 12
21a41b2129STianjia Zhang#define state_h4 16
22a41b2129STianjia Zhang#define state_h5 20
23a41b2129STianjia Zhang#define state_h6 24
24a41b2129STianjia Zhang#define state_h7 28
25a41b2129STianjia Zhang
26a41b2129STianjia Zhang/* Stack structure */
/*
 * The scratch area for message words is 32 * 2 * 3 = 192 bytes, i.e. three
 * 64-byte slots made of two 32-byte halves each:
 *   - XW_W_ADDR() addresses bytes 0-31 of slots 0 and 1 (offs 0 / 16),
 *   - IW_W_ADDR() addresses bytes 32-63 of slots 0 to 2 (offs 32 / 48).
 * STACK_W is the offset of that area from sp.  sm3_neon_transform subtracts
 * STACK_SIZE from sp and then rounds the result down to a 64-byte boundary.
 */
27a41b2129STianjia Zhang
28a41b2129STianjia Zhang#define STACK_W_SIZE        (32 * 2 * 3)
29a41b2129STianjia Zhang
30a41b2129STianjia Zhang#define STACK_W             (0)
31a41b2129STianjia Zhang#define STACK_SIZE          (STACK_W + STACK_W_SIZE)
32a41b2129STianjia Zhang
33a41b2129STianjia Zhang/* Register macros */
/*
 * Of the registers named here, x19-x26, x28 and x29 are pushed by the
 * prologue of sm3_neon_transform before they are used.
 */
34a41b2129STianjia Zhang
/* x0-x2 follow the documented signature sm3_neon_transform(sst, src, blocks). */
35a41b2129STianjia Zhang#define RSTATE x0
36a41b2129STianjia Zhang#define RDATA  x1
37a41b2129STianjia Zhang#define RNBLKS x2
/*
 * RKPTR is loaded with the address of .LKtable (read by KL()).
 * RFRAME receives a copy of sp taken right after the register pushes.
 */
38a41b2129STianjia Zhang#define RKPTR  x28
39a41b2129STianjia Zhang#define RFRAME x29
40a41b2129STianjia Zhang
/* Eight 32-bit working variables, loaded from the state at function entry. */
41a41b2129STianjia Zhang#define ra w3
42a41b2129STianjia Zhang#define rb w4
43a41b2129STianjia Zhang#define rc w5
44a41b2129STianjia Zhang#define rd w6
45a41b2129STianjia Zhang#define re w7
46a41b2129STianjia Zhang#define rf w8
47a41b2129STianjia Zhang#define rg w9
48a41b2129STianjia Zhang#define rh w10
49a41b2129STianjia Zhang
/* 32-bit scratch registers used by the round macro R and its FF/GG steps. */
50a41b2129STianjia Zhang#define t0 w11
51a41b2129STianjia Zhang#define t1 w12
52a41b2129STianjia Zhang#define t2 w13
53a41b2129STianjia Zhang#define t3 w14
54a41b2129STianjia Zhang#define t4 w15
55a41b2129STianjia Zhang#define t5 w16
56a41b2129STianjia Zhang#define t6 w17
57a41b2129STianjia Zhang
/*
 * Pair of round constants filled by KL(); R consumes (and overwrites) one of
 * them per round.  Also reused to hold the last two stored state words while
 * the chaining variables are updated.
 */
58a41b2129STianjia Zhang#define k_even w19
59a41b2129STianjia Zhang#define k_odd w20
60a41b2129STianjia Zhang
/* Scratch address registers for vector stores into the stack W area. */
61a41b2129STianjia Zhang#define addr0 x21
62a41b2129STianjia Zhang#define addr1 x22
63a41b2129STianjia Zhang
/* Hold stored state words while the chaining variables are updated. */
64a41b2129STianjia Zhang#define s0 w23
65a41b2129STianjia Zhang#define s1 w24
66a41b2129STianjia Zhang#define s2 w25
67a41b2129STianjia Zhang#define s3 w26
68a41b2129STianjia Zhang
/* Vector registers carrying message words for the scheduler (three per register). */
69a41b2129STianjia Zhang#define W0 v0
70a41b2129STianjia Zhang#define W1 v1
71a41b2129STianjia Zhang#define W2 v2
72a41b2129STianjia Zhang#define W3 v3
73a41b2129STianjia Zhang#define W4 v4
74a41b2129STianjia Zhang#define W5 v5
75a41b2129STianjia Zhang
/* Vector temporaries used by the block-loading and message-scheduling macros. */
76a41b2129STianjia Zhang#define XTMP0 v6
77a41b2129STianjia Zhang#define XTMP1 v7
78a41b2129STianjia Zhang#define XTMP2 v16
79a41b2129STianjia Zhang#define XTMP3 v17
80a41b2129STianjia Zhang#define XTMP4 v18
81a41b2129STianjia Zhang#define XTMP5 v19
82a41b2129STianjia Zhang#define XTMP6 v20
83a41b2129STianjia Zhang
84a41b2129STianjia Zhang/* Helper macros. */
85a41b2129STianjia Zhang
/*
 * No-op placeholder: accepts any arguments and expands to an empty comment.
 * Passed as K_LOAD or IOP to the round macro when a slot should emit nothing.
 */
86a41b2129STianjia Zhang#define _(...) /*_*/
87a41b2129STianjia Zhang
/* Zero the whole vector register x. */
88a41b2129STianjia Zhang#define clear_vec(x) \
89a41b2129STianjia Zhang	movi	x.8h, #0;
90a41b2129STianjia Zhang
/*
 * o = a rotated left by n bits, expressed as a rotate right by (32 - n).
 * Callers in this file pass 32-bit (w) registers and n in 1..31.
 */
91a41b2129STianjia Zhang#define rolw(o, a, n) \
92a41b2129STianjia Zhang	ror	o, a, #(32 - n);
93a41b2129STianjia Zhang
94a41b2129STianjia Zhang/* Round function macros. */
/*
 * Each boolean function is split into three steps (_1, _2, _3) so that the
 * round macro R can spread them between other instructions.  Arguments are
 * always (x, y, z, o, t): o receives the result, t is a scratch register.
 * After step _3 has run:
 *   GG1 / FF1:  o = x ^ y ^ z              (t unused)
 *   GG2:        o = (x & y) ^ (~x & z)
 *   FF2:        o = ((x ^ y) & z) ^ (x & y)
 */
95a41b2129STianjia Zhang
96a41b2129STianjia Zhang#define GG1_1(x, y, z, o, t) \
97a41b2129STianjia Zhang	eor	o, x, y;
98a41b2129STianjia Zhang#define GG1_2(x, y, z, o, t) \
99a41b2129STianjia Zhang	eor	o, o, z;
100a41b2129STianjia Zhang#define GG1_3(x, y, z, o, t)
101a41b2129STianjia Zhang
/* Same XOR as GG1, but the second eor is issued in step 3 instead of step 2. */
102a41b2129STianjia Zhang#define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t)
103a41b2129STianjia Zhang#define FF1_2(x, y, z, o, t)
104a41b2129STianjia Zhang#define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t)
105a41b2129STianjia Zhang
/* bic computes z & ~x; the x & y term is built in t and merged in step 3. */
106a41b2129STianjia Zhang#define GG2_1(x, y, z, o, t) \
107a41b2129STianjia Zhang	bic	o, z, x;
108a41b2129STianjia Zhang#define GG2_2(x, y, z, o, t) \
109a41b2129STianjia Zhang	and	t, y, x;
110a41b2129STianjia Zhang#define GG2_3(x, y, z, o, t) \
111a41b2129STianjia Zhang	eor	o, o, t;
112a41b2129STianjia Zhang
/* Step 2 emits two instructions: t = x & y and o = (x ^ y) & z. */
113a41b2129STianjia Zhang#define FF2_1(x, y, z, o, t) \
114a41b2129STianjia Zhang	eor	o, x, y;
115a41b2129STianjia Zhang#define FF2_2(x, y, z, o, t) \
116a41b2129STianjia Zhang	and	t, x, y; \
117a41b2129STianjia Zhang	and	o, o, z;
118a41b2129STianjia Zhang#define FF2_3(x, y, z, o, t) \
119a41b2129STianjia Zhang	eor	o, o, t;
120a41b2129STianjia Zhang
/*
 * One compression round, with eight slots for interleaved foreign work.
 *
 *   i                 1 or 2; selects FF1/GG1 or FF2/GG2.
 *   a..h              32-bit registers holding the working variables.
 *   k                 register holding this round's constant (clobbered).
 *   K_LOAD            invoked as K_LOAD(round) first; KL or the no-op `_`.
 *   round, widx,      select the stack words to read: wtype is IW or XW and
 *   wtype             is pasted onto _W1_ADDR / _W1W2_ADDR.
 *   IOP, iop_param    IOP(n, iop_param) is expanded at slots n = 1..8.
 *
 * With W1 / W1W2 the two words loaded from the stack, the macro computes
 *   t0 = rol(a, 12)
 *   k  = rol(t0 + e + k, 7)
 *   d  = d + W1W2 + FF(a, b, c) + (t0 ^ k)
 *   t3 = GG(e, f, g) + h + W1 + k
 *   h  = t3 ^ rol(t3, 9) ^ rol(t3, 17)
 *   b  = rol(b, 9);  f = rol(f, 19)
 * a, c, e and g are left unchanged; t0-t6 and k are clobbered.  The new
 * values land in d and h, which is why successive invocations rotate the
 * register arguments instead of moving data.
 */
121a41b2129STianjia Zhang#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
122a41b2129STianjia Zhang	K_LOAD(round);                                                        \
123a41b2129STianjia Zhang	ldr	t5, [sp, #(wtype##_W1_ADDR(round, widx))];                    \
124a41b2129STianjia Zhang	rolw(t0, a, 12);                              /* rol(a, 12) => t0 */  \
125a41b2129STianjia Zhang      IOP(1, iop_param);                                                      \
126a41b2129STianjia Zhang	FF##i##_1(a, b, c, t1, t2);                                           \
127a41b2129STianjia Zhang	ldr	t6, [sp, #(wtype##_W1W2_ADDR(round, widx))];                  \
128a41b2129STianjia Zhang	add	k, k, e;                                                      \
129a41b2129STianjia Zhang      IOP(2, iop_param);                                                      \
130a41b2129STianjia Zhang	GG##i##_1(e, f, g, t3, t4);                                           \
131a41b2129STianjia Zhang	FF##i##_2(a, b, c, t1, t2);                                           \
132a41b2129STianjia Zhang      IOP(3, iop_param);                                                      \
133a41b2129STianjia Zhang	add	k, k, t0;                                                     \
134a41b2129STianjia Zhang	add	h, h, t5;                                                     \
135a41b2129STianjia Zhang	add	d, d, t6;                     /* w1w2 + d => d */             \
136a41b2129STianjia Zhang      IOP(4, iop_param);                                                      \
137a41b2129STianjia Zhang	rolw(k, k, 7);                        /* rol(t0 + e + k, 7) => k */   \
138a41b2129STianjia Zhang	GG##i##_2(e, f, g, t3, t4);                                           \
139a41b2129STianjia Zhang	add	h, h, k;                      /* h + w1 + k => h */           \
140a41b2129STianjia Zhang      IOP(5, iop_param);                                                      \
141a41b2129STianjia Zhang	FF##i##_3(a, b, c, t1, t2);                                           \
142a41b2129STianjia Zhang	eor	t0, t0, k;                    /* k ^ t0 => t0 */              \
143a41b2129STianjia Zhang	GG##i##_3(e, f, g, t3, t4);                                           \
144a41b2129STianjia Zhang	add	d, d, t1;                     /* FF(a,b,c) + d => d */        \
145a41b2129STianjia Zhang      IOP(6, iop_param);                                                      \
146a41b2129STianjia Zhang	add	t3, t3, h;                    /* GG(e,f,g) + h => t3 */       \
147a41b2129STianjia Zhang	rolw(b, b, 9);                        /* rol(b, 9) => b */            \
148a41b2129STianjia Zhang	eor	h, t3, t3, ror #(32-9);                                       \
149a41b2129STianjia Zhang      IOP(7, iop_param);                                                      \
150a41b2129STianjia Zhang	add	d, d, t0;                     /* t0 + d => d */               \
151a41b2129STianjia Zhang	rolw(f, f, 19);                       /* rol(f, 19) => f */           \
152a41b2129STianjia Zhang      IOP(8, iop_param);                                                      \
153a41b2129STianjia Zhang	eor	h, h, t3, ror #(32-17);       /* P0(t3) => h */
154a41b2129STianjia Zhang
/*
 * R1 and R2 forward to R with i = 1 and i = 2 respectively; the transform
 * uses R1 for rounds 0-15 and R2 for rounds 16-63.
 * NOTE(review): the `##` in front of each register argument presumably keeps
 * the argument from being macro-expanded before it is handed to R, and relies
 * on the preprocessor tolerating a paste with the preceding comma when
 * processing assembler input - confirm before touching.
 */
155a41b2129STianjia Zhang#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
156a41b2129STianjia Zhang	R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)
157a41b2129STianjia Zhang
158a41b2129STianjia Zhang#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
159a41b2129STianjia Zhang	R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)
160a41b2129STianjia Zhang
/*
 * Load two consecutive 32-bit round constants, starting at byte offset
 * 4 * round from RKPTR, into k_even and k_odd.  The transform invokes it on
 * even rounds only; the following odd round passes `_` and uses k_odd.
 */
161a41b2129STianjia Zhang#define KL(round) \
162a41b2129STianjia Zhang	ldp	k_even, k_odd, [RKPTR, #(4*(round))];
163a41b2129STianjia Zhang
164a41b2129STianjia Zhang/* Input expansion macros. */
/*
 * All addresses below are byte offsets from sp into the stack W area.
 * `widx` picks one 32-bit word, `offs` picks the 16-byte vector inside the
 * slot: the "W1" vector, or the "W1W2" vector that follows it 16 bytes later.
 */
165a41b2129STianjia Zhang
166a41b2129STianjia Zhang/* Byte-swapped input address: one 64-byte slot per group of four rounds. */
167a41b2129STianjia Zhang#define IW_W_ADDR(round, widx, offs) \
168a41b2129STianjia Zhang	(STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))
169a41b2129STianjia Zhang
170a41b2129STianjia Zhang/* Expanded input address: groups of three rounds alternate between slot 0 and slot 1. Only meaningful for round >= 12. */
171a41b2129STianjia Zhang#define XW_W_ADDR(round, widx, offs) \
172a41b2129STianjia Zhang	(STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))
173a41b2129STianjia Zhang
174a41b2129STianjia Zhang/* Rounds 0-11 (as numbered in the R1 invocations), byte-swapped input block addresses. */
175a41b2129STianjia Zhang#define IW_W1_ADDR(round, widx)   IW_W_ADDR(round, widx, 32)
176a41b2129STianjia Zhang#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 48)
177a41b2129STianjia Zhang
178a41b2129STianjia Zhang/* Rounds 12-63 (as numbered in the R1/R2 invocations), expanded input block addresses. */
179a41b2129STianjia Zhang#define XW_W1_ADDR(round, widx)   XW_W_ADDR(round, widx, 0)
180a41b2129STianjia Zhang#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 16)
181a41b2129STianjia Zhang
182a41b2129STianjia Zhang/* Input block loading.
183a41b2129STianjia Zhang * Interleaving within round function needed for in-order CPUs. */
/*
 * The load is cut into 24 steps, LOAD_W_VEC_<g>_<n> with g = 1..3 and
 * n = 1..8, so that each step can fill one IOP slot of the round macro.
 * Run in order, the steps:
 *   - read 64 bytes from RDATA (post-incremented) into W0..W3,
 *   - reverse the bytes of every 32-bit word into XTMP0..XTMP3,
 *   - store XTMP0 and XTMP1 ^ XTMP0 at IW_W1_ADDR(0, 0),
 *     XTMP1 and XTMP2 ^ XTMP1 at IW_W1_ADDR(4, 0),
 *     XTMP2 and XTMP3 ^ XTMP2 at IW_W1_ADDR(8, 0),
 *   - rearrange the words into W0..W5 (see the lane comments on each ext/mov).
 * Clobbers addr0, addr1, W0..W5 and XTMP0..XTMP6.
 */
184a41b2129STianjia Zhang#define LOAD_W_VEC_1_1() \
185a41b2129STianjia Zhang	add	addr0, sp, #IW_W1_ADDR(0, 0);
186a41b2129STianjia Zhang#define LOAD_W_VEC_1_2() \
187a41b2129STianjia Zhang	add	addr1, sp, #IW_W1_ADDR(4, 0);
188a41b2129STianjia Zhang#define LOAD_W_VEC_1_3() \
189a41b2129STianjia Zhang	ld1	{W0.16b}, [RDATA], #16;
190a41b2129STianjia Zhang#define LOAD_W_VEC_1_4() \
191a41b2129STianjia Zhang	ld1	{W1.16b}, [RDATA], #16;
192a41b2129STianjia Zhang#define LOAD_W_VEC_1_5() \
193a41b2129STianjia Zhang	ld1	{W2.16b}, [RDATA], #16;
194a41b2129STianjia Zhang#define LOAD_W_VEC_1_6() \
195a41b2129STianjia Zhang	ld1	{W3.16b}, [RDATA], #16;
196a41b2129STianjia Zhang#define LOAD_W_VEC_1_7() \
197a41b2129STianjia Zhang	rev32	XTMP0.16b, W0.16b;
198a41b2129STianjia Zhang#define LOAD_W_VEC_1_8() \
199a41b2129STianjia Zhang	rev32	XTMP1.16b, W1.16b;
200a41b2129STianjia Zhang#define LOAD_W_VEC_2_1() \
201a41b2129STianjia Zhang	rev32	XTMP2.16b, W2.16b;
202a41b2129STianjia Zhang#define LOAD_W_VEC_2_2() \
203a41b2129STianjia Zhang	rev32	XTMP3.16b, W3.16b;
204a41b2129STianjia Zhang#define LOAD_W_VEC_2_3() \
205a41b2129STianjia Zhang	eor	XTMP4.16b, XTMP1.16b, XTMP0.16b;
206a41b2129STianjia Zhang#define LOAD_W_VEC_2_4() \
207a41b2129STianjia Zhang	eor	XTMP5.16b, XTMP2.16b, XTMP1.16b;
208a41b2129STianjia Zhang#define LOAD_W_VEC_2_5() \
209a41b2129STianjia Zhang	st1	{XTMP0.16b}, [addr0], #16;
/* addr0 is re-pointed at the third slot as soon as the first one is written. */
210a41b2129STianjia Zhang#define LOAD_W_VEC_2_6() \
211a41b2129STianjia Zhang	st1	{XTMP4.16b}, [addr0]; \
212a41b2129STianjia Zhang	add	addr0, sp, #IW_W1_ADDR(8, 0);
213a41b2129STianjia Zhang#define LOAD_W_VEC_2_7() \
214a41b2129STianjia Zhang	eor	XTMP6.16b, XTMP3.16b, XTMP2.16b;
215a41b2129STianjia Zhang#define LOAD_W_VEC_2_8() \
216a41b2129STianjia Zhang	ext	W0.16b, XTMP0.16b, XTMP0.16b, #8;  /* W0: xx, w0, xx, xx */
217a41b2129STianjia Zhang#define LOAD_W_VEC_3_1() \
218a41b2129STianjia Zhang	mov	W2.16b, XTMP1.16b;                 /* W2: xx, w6, w5, w4 */
219a41b2129STianjia Zhang#define LOAD_W_VEC_3_2() \
220a41b2129STianjia Zhang	st1	{XTMP1.16b}, [addr1], #16;
221a41b2129STianjia Zhang#define LOAD_W_VEC_3_3() \
222a41b2129STianjia Zhang	st1	{XTMP5.16b}, [addr1]; \
223a41b2129STianjia Zhang	ext	W1.16b, XTMP0.16b, XTMP0.16b, #4;  /* W1: xx, w3, w2, w1 */
224a41b2129STianjia Zhang#define LOAD_W_VEC_3_4() \
225a41b2129STianjia Zhang	ext	W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */
226a41b2129STianjia Zhang#define LOAD_W_VEC_3_5() \
227a41b2129STianjia Zhang	ext	W4.16b, XTMP2.16b, XTMP3.16b, #8;  /* W4: xx, w12, w11, w10 */
228a41b2129STianjia Zhang#define LOAD_W_VEC_3_6() \
229a41b2129STianjia Zhang	st1	{XTMP2.16b}, [addr0], #16;
230a41b2129STianjia Zhang#define LOAD_W_VEC_3_7() \
231a41b2129STianjia Zhang	st1	{XTMP6.16b}, [addr0];
232a41b2129STianjia Zhang#define LOAD_W_VEC_3_8() \
233a41b2129STianjia Zhang	ext	W5.16b, XTMP3.16b, XTMP3.16b, #4;  /* W5: xx, w15, w14, w13 */
234a41b2129STianjia Zhang
/*
 * Dispatchers with the IOP(slot, param) shape expected by the round macro:
 * the slot number selects the step, any further argument is ignored.
 */
235a41b2129STianjia Zhang#define LOAD_W_VEC_1(iop_num, ...) \
236a41b2129STianjia Zhang	LOAD_W_VEC_1_##iop_num()
237a41b2129STianjia Zhang#define LOAD_W_VEC_2(iop_num, ...) \
238a41b2129STianjia Zhang	LOAD_W_VEC_2_##iop_num()
239a41b2129STianjia Zhang#define LOAD_W_VEC_3(iop_num, ...) \
240a41b2129STianjia Zhang	LOAD_W_VEC_3_##iop_num()
241a41b2129STianjia Zhang
242a41b2129STianjia Zhang/* Message scheduling. Note: 3 words per vector register.
243a41b2129STianjia Zhang * Interleaving within round function needed for in-order CPUs. */
/*
 * One scheduling pass is cut into 24 steps, SCHED_W_<g>_<n> with g = 1..3
 * and n = 1..8, each sized to fill one IOP slot of the round macro.
 * (w0..w5) name six message vectors in rotation order; `round` selects the
 * stack slot via XW_W1_ADDR(round, 0).  Run in order, a pass:
 *   - builds XTMP0 from w0/w1 and XTMP5 from w1/w2 with ext,
 *   - XTMP0 = XTMP0 ^ w3 ^ rol(w5, 15)
 *   - XTMP1 = rol(XTMP5, 7) ^ w4
 *   - w0    = XTMP1 ^ XTMP0 ^ rol(XTMP0, 15) ^ rol(XTMP0, 23)
 *   - builds XTMP2 from w4/w5 with ext, sets XTMP3 = XTMP2 ^ w0,
 *   - stores XTMP2 and XTMP3 as one 32-byte pair at the slot address.
 * Rotates are done per 32-bit lane with a shl/sri pair.
 * Clobbers addr0 (set in step 1_6, consumed in step 3_7), XTMP0..XTMP5 and
 * overwrites w0.  Steps 3_6 and 3_8 are intentionally empty.
 */
244a41b2129STianjia Zhang#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \
245a41b2129STianjia Zhang	/* Load (w[i - 16]) => XTMP0 */            \
246a41b2129STianjia Zhang	/* Load (w[i - 13]) => XTMP5 */            \
247a41b2129STianjia Zhang	ext	XTMP0.16b, w0.16b, w0.16b, #12;    /* XTMP0: w0, xx, xx, xx */
248a41b2129STianjia Zhang#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \
249a41b2129STianjia Zhang	ext	XTMP5.16b, w1.16b, w1.16b, #12;
250a41b2129STianjia Zhang#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \
251a41b2129STianjia Zhang	ext	XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */
252a41b2129STianjia Zhang#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \
253a41b2129STianjia Zhang	ext	XTMP5.16b, XTMP5.16b, w2.16b, #12;
254a41b2129STianjia Zhang#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \
255a41b2129STianjia Zhang	/* w[i - 9] == w3 */                       \
256a41b2129STianjia Zhang	/* W3 ^ XTMP0 => XTMP0 */                  \
257a41b2129STianjia Zhang	eor	XTMP0.16b, XTMP0.16b, w3.16b;
258a41b2129STianjia Zhang#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \
259a41b2129STianjia Zhang	/* w[i - 3] == w5 */                       \
260a41b2129STianjia Zhang	/* rol(w5, 15) ^ XTMP0 => XTMP0 */         \
261a41b2129STianjia Zhang	/* rol(XTMP5, 7) => XTMP1 */               \
262a41b2129STianjia Zhang	add	addr0, sp, #XW_W1_ADDR((round), 0); \
263a41b2129STianjia Zhang	shl	XTMP2.4s, w5.4s, #15;
264a41b2129STianjia Zhang#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \
265a41b2129STianjia Zhang	shl	XTMP1.4s, XTMP5.4s, #7;
266a41b2129STianjia Zhang#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \
267a41b2129STianjia Zhang	sri	XTMP2.4s, w5.4s, #(32-15);
268a41b2129STianjia Zhang#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \
269a41b2129STianjia Zhang	sri	XTMP1.4s, XTMP5.4s, #(32-7);
270a41b2129STianjia Zhang#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \
271a41b2129STianjia Zhang	eor	XTMP0.16b, XTMP0.16b, XTMP2.16b;
272a41b2129STianjia Zhang#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \
273a41b2129STianjia Zhang	/* w[i - 6] == W4 */                       \
274a41b2129STianjia Zhang	/* W4 ^ XTMP1 => XTMP1 */                  \
275a41b2129STianjia Zhang	eor	XTMP1.16b, XTMP1.16b, w4.16b;
276a41b2129STianjia Zhang#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \
277a41b2129STianjia Zhang	/* P1(XTMP0) ^ XTMP1 => W0 */              \
278a41b2129STianjia Zhang	shl	XTMP3.4s, XTMP0.4s, #15;
279a41b2129STianjia Zhang#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \
280a41b2129STianjia Zhang	shl	XTMP4.4s, XTMP0.4s, #23;
281a41b2129STianjia Zhang#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \
282a41b2129STianjia Zhang	eor	w0.16b, XTMP1.16b, XTMP0.16b;
283a41b2129STianjia Zhang#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \
284a41b2129STianjia Zhang	sri	XTMP3.4s, XTMP0.4s, #(32-15);
285a41b2129STianjia Zhang#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \
286a41b2129STianjia Zhang	sri	XTMP4.4s, XTMP0.4s, #(32-23);
287a41b2129STianjia Zhang#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \
288a41b2129STianjia Zhang	eor	w0.16b, w0.16b, XTMP3.16b;
289a41b2129STianjia Zhang#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \
290a41b2129STianjia Zhang	/* Load (w[i - 3]) => XTMP2 */             \
291a41b2129STianjia Zhang	ext	XTMP2.16b, w4.16b, w4.16b, #12;
292a41b2129STianjia Zhang#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \
293a41b2129STianjia Zhang	eor	w0.16b, w0.16b, XTMP4.16b;
294a41b2129STianjia Zhang#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \
295a41b2129STianjia Zhang	ext	XTMP2.16b, XTMP2.16b, w5.16b, #12;
296a41b2129STianjia Zhang#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \
297a41b2129STianjia Zhang	/* W1 ^ W2 => XTMP3 */                     \
298a41b2129STianjia Zhang	eor	XTMP3.16b, XTMP2.16b, w0.16b;
299a41b2129STianjia Zhang#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5)
300a41b2129STianjia Zhang#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \
301a41b2129STianjia Zhang	st1	{XTMP2.16b-XTMP3.16b}, [addr0];
302a41b2129STianjia Zhang#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5)
303a41b2129STianjia Zhang
/*
 * Dispatchers with the IOP(slot, param) shape expected by the round macro.
 * The name encodes which rotation of W0..W5 is passed as (w0..w5) to the
 * scheduling steps, and the trailing _1/_2/_3 selects the step group; the
 * slot number picks the step and `round` is forwarded unchanged.  The first
 * register in the name is the one overwritten by that pass.
 */
304a41b2129STianjia Zhang#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \
305a41b2129STianjia Zhang	SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5)
306a41b2129STianjia Zhang#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \
307a41b2129STianjia Zhang	SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5)
308a41b2129STianjia Zhang#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \
309a41b2129STianjia Zhang	SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5)
310a41b2129STianjia Zhang
311a41b2129STianjia Zhang#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \
312a41b2129STianjia Zhang	SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0)
313a41b2129STianjia Zhang#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \
314a41b2129STianjia Zhang	SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0)
315a41b2129STianjia Zhang#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \
316a41b2129STianjia Zhang	SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0)
317a41b2129STianjia Zhang
318a41b2129STianjia Zhang#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \
319a41b2129STianjia Zhang	SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1)
320a41b2129STianjia Zhang#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \
321a41b2129STianjia Zhang	SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1)
322a41b2129STianjia Zhang#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \
323a41b2129STianjia Zhang	SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1)
324a41b2129STianjia Zhang
325a41b2129STianjia Zhang#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \
326a41b2129STianjia Zhang	SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2)
327a41b2129STianjia Zhang#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \
328a41b2129STianjia Zhang	SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2)
329a41b2129STianjia Zhang#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \
330a41b2129STianjia Zhang	SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2)
331a41b2129STianjia Zhang
332a41b2129STianjia Zhang#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \
333a41b2129STianjia Zhang	SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3)
334a41b2129STianjia Zhang#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \
335a41b2129STianjia Zhang	SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3)
336a41b2129STianjia Zhang#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \
337a41b2129STianjia Zhang	SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3)
338a41b2129STianjia Zhang
339a41b2129STianjia Zhang#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \
340a41b2129STianjia Zhang	SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4)
341a41b2129STianjia Zhang#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \
342a41b2129STianjia Zhang	SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4)
343a41b2129STianjia Zhang#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \
344a41b2129STianjia Zhang	SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4)
345a41b2129STianjia Zhang
346a41b2129STianjia Zhang
347a41b2129STianjia Zhang	/*
348a41b2129STianjia Zhang	 * Transform blocks*64 bytes (blocks*16 32-bit words) at 'src'.
349a41b2129STianjia Zhang	 *
350a41b2129STianjia Zhang	 * void sm3_neon_transform(struct sm3_state *sst, u8 const *src,
351a41b2129STianjia Zhang	 *                         int blocks)
352a41b2129STianjia Zhang	 */
353a41b2129STianjia Zhang	.text
354a41b2129STianjia Zhang.align 3
355*be8f6b64SEric BiggersSYM_TYPED_FUNC_START(sm3_neon_transform)
356a41b2129STianjia Zhang	ldp		ra, rb, [RSTATE, #0]
357a41b2129STianjia Zhang	ldp		rc, rd, [RSTATE, #8]
358a41b2129STianjia Zhang	ldp		re, rf, [RSTATE, #16]
359a41b2129STianjia Zhang	ldp		rg, rh, [RSTATE, #24]
360a41b2129STianjia Zhang
361a41b2129STianjia Zhang	stp		x28, x29, [sp, #-16]!
362a41b2129STianjia Zhang	stp		x19, x20, [sp, #-16]!
363a41b2129STianjia Zhang	stp		x21, x22, [sp, #-16]!
364a41b2129STianjia Zhang	stp		x23, x24, [sp, #-16]!
365a41b2129STianjia Zhang	stp		x25, x26, [sp, #-16]!
366a41b2129STianjia Zhang	mov		RFRAME, sp
367a41b2129STianjia Zhang
368a41b2129STianjia Zhang	sub		addr0, sp, #STACK_SIZE
369a41b2129STianjia Zhang	adr_l		RKPTR, .LKtable
370a41b2129STianjia Zhang	and		sp, addr0, #(~63)
371a41b2129STianjia Zhang
372a41b2129STianjia Zhang	/* Preload first block. */
373a41b2129STianjia Zhang	LOAD_W_VEC_1(1, 0)
374a41b2129STianjia Zhang	LOAD_W_VEC_1(2, 0)
375a41b2129STianjia Zhang	LOAD_W_VEC_1(3, 0)
376a41b2129STianjia Zhang	LOAD_W_VEC_1(4, 0)
377a41b2129STianjia Zhang	LOAD_W_VEC_1(5, 0)
378a41b2129STianjia Zhang	LOAD_W_VEC_1(6, 0)
379a41b2129STianjia Zhang	LOAD_W_VEC_1(7, 0)
380a41b2129STianjia Zhang	LOAD_W_VEC_1(8, 0)
381a41b2129STianjia Zhang	LOAD_W_VEC_2(1, 0)
382a41b2129STianjia Zhang	LOAD_W_VEC_2(2, 0)
383a41b2129STianjia Zhang	LOAD_W_VEC_2(3, 0)
384a41b2129STianjia Zhang	LOAD_W_VEC_2(4, 0)
385a41b2129STianjia Zhang	LOAD_W_VEC_2(5, 0)
386a41b2129STianjia Zhang	LOAD_W_VEC_2(6, 0)
387a41b2129STianjia Zhang	LOAD_W_VEC_2(7, 0)
388a41b2129STianjia Zhang	LOAD_W_VEC_2(8, 0)
389a41b2129STianjia Zhang	LOAD_W_VEC_3(1, 0)
390a41b2129STianjia Zhang	LOAD_W_VEC_3(2, 0)
391a41b2129STianjia Zhang	LOAD_W_VEC_3(3, 0)
392a41b2129STianjia Zhang	LOAD_W_VEC_3(4, 0)
393a41b2129STianjia Zhang	LOAD_W_VEC_3(5, 0)
394a41b2129STianjia Zhang	LOAD_W_VEC_3(6, 0)
395a41b2129STianjia Zhang	LOAD_W_VEC_3(7, 0)
396a41b2129STianjia Zhang	LOAD_W_VEC_3(8, 0)
397a41b2129STianjia Zhang
398a41b2129STianjia Zhang.balign 16
399a41b2129STianjia Zhang.Loop:
400a41b2129STianjia Zhang	/* Transform 0-3 */
401a41b2129STianjia Zhang	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0)
402a41b2129STianjia Zhang	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  1, 1, IW, _, 0)
403a41b2129STianjia Zhang	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0)
404a41b2129STianjia Zhang	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  3, 3, IW, _, 0)
405a41b2129STianjia Zhang
406a41b2129STianjia Zhang	/* Transform 4-7 + Precalc 12-14 */
407a41b2129STianjia Zhang	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0)
408a41b2129STianjia Zhang	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  5, 1, IW, _, 0)
409a41b2129STianjia Zhang	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12)
410a41b2129STianjia Zhang	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12)
411a41b2129STianjia Zhang
412a41b2129STianjia Zhang	/* Transform 8-11 + Precalc 12-17 */
413a41b2129STianjia Zhang	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12)
414a41b2129STianjia Zhang	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15)
415a41b2129STianjia Zhang	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15)
416a41b2129STianjia Zhang	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15)
417a41b2129STianjia Zhang
418a41b2129STianjia Zhang	/* Transform 12-14 + Precalc 18-20 */
419a41b2129STianjia Zhang	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18)
420a41b2129STianjia Zhang	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18)
421a41b2129STianjia Zhang	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18)
422a41b2129STianjia Zhang
423a41b2129STianjia Zhang	/* Transform 15-17 + Precalc 21-23 */
424a41b2129STianjia Zhang	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21)
425a41b2129STianjia Zhang	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21)
426a41b2129STianjia Zhang	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21)
427a41b2129STianjia Zhang
428a41b2129STianjia Zhang	/* Transform 18-20 + Precalc 24-26 */
429a41b2129STianjia Zhang	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24)
430a41b2129STianjia Zhang	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24)
431a41b2129STianjia Zhang	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24)
432a41b2129STianjia Zhang
433a41b2129STianjia Zhang	/* Transform 21-23 + Precalc 27-29 */
434a41b2129STianjia Zhang	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27)
435a41b2129STianjia Zhang	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27)
436a41b2129STianjia Zhang	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27)
437a41b2129STianjia Zhang
438a41b2129STianjia Zhang	/* Transform 24-26 + Precalc 30-32 */
439a41b2129STianjia Zhang	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30)
440a41b2129STianjia Zhang	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30)
441a41b2129STianjia Zhang	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30)
442a41b2129STianjia Zhang
443a41b2129STianjia Zhang	/* Transform 27-29 + Precalc 33-35 */
444a41b2129STianjia Zhang	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33)
445a41b2129STianjia Zhang	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33)
446a41b2129STianjia Zhang	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33)
447a41b2129STianjia Zhang
448a41b2129STianjia Zhang	/* Transform 30-32 + Precalc 36-38 */
449a41b2129STianjia Zhang	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36)
450a41b2129STianjia Zhang	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36)
451a41b2129STianjia Zhang	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36)
452a41b2129STianjia Zhang
453a41b2129STianjia Zhang	/* Transform 33-35 + Precalc 39-41 */
454a41b2129STianjia Zhang	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39)
455a41b2129STianjia Zhang	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39)
456a41b2129STianjia Zhang	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39)
457a41b2129STianjia Zhang
458a41b2129STianjia Zhang	/* Transform 36-38 + Precalc 42-44 */
459a41b2129STianjia Zhang	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42)
460a41b2129STianjia Zhang	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42)
461a41b2129STianjia Zhang	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42)
462a41b2129STianjia Zhang
463a41b2129STianjia Zhang	/* Transform 39-41 + Precalc 45-47 */
464a41b2129STianjia Zhang	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45)
465a41b2129STianjia Zhang	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45)
466a41b2129STianjia Zhang	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45)
467a41b2129STianjia Zhang
468a41b2129STianjia Zhang	/* Transform 42-44 + Precalc 48-50 */
469a41b2129STianjia Zhang	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48)
470a41b2129STianjia Zhang	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48)
471a41b2129STianjia Zhang	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48)
472a41b2129STianjia Zhang
473a41b2129STianjia Zhang	/* Transform 45-47 + Precalc 51-53 */
474a41b2129STianjia Zhang	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51)
475a41b2129STianjia Zhang	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51)
476a41b2129STianjia Zhang	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51)
477a41b2129STianjia Zhang
478a41b2129STianjia Zhang	/* Transform 48-50 + Precalc 54-56 */
479a41b2129STianjia Zhang	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54)
480a41b2129STianjia Zhang	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54)
481a41b2129STianjia Zhang	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54)
482a41b2129STianjia Zhang
483a41b2129STianjia Zhang	/* Transform 51-53 + Precalc 57-59 */
484a41b2129STianjia Zhang	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57)
485a41b2129STianjia Zhang	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57)
486a41b2129STianjia Zhang	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57)
487a41b2129STianjia Zhang
488a41b2129STianjia Zhang	/* Transform 54-56 + Precalc 60-62 */
489a41b2129STianjia Zhang	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60)
490a41b2129STianjia Zhang	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60)
491a41b2129STianjia Zhang	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60)
492a41b2129STianjia Zhang
493a41b2129STianjia Zhang	/* Transform 57-59 + Precalc 63 */
494a41b2129STianjia Zhang	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63)
495a41b2129STianjia Zhang	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63)
496a41b2129STianjia Zhang	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63)
497a41b2129STianjia Zhang
498a41b2129STianjia Zhang	/* Transform 60 */
499a41b2129STianjia Zhang	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _)
500a41b2129STianjia Zhang	subs		RNBLKS, RNBLKS, #1
501a41b2129STianjia Zhang	b.eq		.Lend
502a41b2129STianjia Zhang
503a41b2129STianjia Zhang	/* Transform 61-63 + Preload next block */
504a41b2129STianjia Zhang	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, LOAD_W_VEC_1, _)
505a41b2129STianjia Zhang	ldp		s0, s1, [RSTATE, #0]
506a41b2129STianjia Zhang	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _)
507a41b2129STianjia Zhang	ldp		s2, s3, [RSTATE, #8]
508a41b2129STianjia Zhang	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, LOAD_W_VEC_3, _)
509a41b2129STianjia Zhang
510a41b2129STianjia Zhang	/* Update the chaining variables. */
511a41b2129STianjia Zhang	eor		ra, ra, s0
512a41b2129STianjia Zhang	eor		rb, rb, s1
513a41b2129STianjia Zhang	ldp		s0, s1, [RSTATE, #16]
514a41b2129STianjia Zhang	eor		rc, rc, s2
515a41b2129STianjia Zhang	ldp		k_even, k_odd, [RSTATE, #24]
516a41b2129STianjia Zhang	eor		rd, rd, s3
517a41b2129STianjia Zhang	eor		re, re, s0
518a41b2129STianjia Zhang	stp		ra, rb, [RSTATE, #0]
519a41b2129STianjia Zhang	eor		rf, rf, s1
520a41b2129STianjia Zhang	stp		rc, rd, [RSTATE, #8]
521a41b2129STianjia Zhang	eor		rg, rg, k_even
522a41b2129STianjia Zhang	stp		re, rf, [RSTATE, #16]
523a41b2129STianjia Zhang	eor		rh, rh, k_odd
524a41b2129STianjia Zhang	stp		rg, rh, [RSTATE, #24]
525a41b2129STianjia Zhang	b		.Loop
526a41b2129STianjia Zhang
527a41b2129STianjia Zhang.Lend:
528a41b2129STianjia Zhang	/* Transform 61-63 */
529a41b2129STianjia Zhang	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, _, _)
530a41b2129STianjia Zhang	ldp		s0, s1, [RSTATE, #0]
531a41b2129STianjia Zhang	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _)
532a41b2129STianjia Zhang	ldp		s2, s3, [RSTATE, #8]
533a41b2129STianjia Zhang	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, _, _)
534a41b2129STianjia Zhang
535a41b2129STianjia Zhang	/* Update the chaining variables. */
536a41b2129STianjia Zhang	eor		ra, ra, s0
537a41b2129STianjia Zhang	clear_vec(W0)
538a41b2129STianjia Zhang	eor		rb, rb, s1
539a41b2129STianjia Zhang	clear_vec(W1)
540a41b2129STianjia Zhang	ldp		s0, s1, [RSTATE, #16]
541a41b2129STianjia Zhang	clear_vec(W2)
542a41b2129STianjia Zhang	eor		rc, rc, s2
543a41b2129STianjia Zhang	clear_vec(W3)
544a41b2129STianjia Zhang	ldp		k_even, k_odd, [RSTATE, #24]
545a41b2129STianjia Zhang	clear_vec(W4)
546a41b2129STianjia Zhang	eor		rd, rd, s3
547a41b2129STianjia Zhang	clear_vec(W5)
548a41b2129STianjia Zhang	eor		re, re, s0
549a41b2129STianjia Zhang	clear_vec(XTMP0)
550a41b2129STianjia Zhang	stp		ra, rb, [RSTATE, #0]
551a41b2129STianjia Zhang	clear_vec(XTMP1)
552a41b2129STianjia Zhang	eor		rf, rf, s1
553a41b2129STianjia Zhang	clear_vec(XTMP2)
554a41b2129STianjia Zhang	stp		rc, rd, [RSTATE, #8]
555a41b2129STianjia Zhang	clear_vec(XTMP3)
556a41b2129STianjia Zhang	eor		rg, rg, k_even
557a41b2129STianjia Zhang	clear_vec(XTMP4)
558a41b2129STianjia Zhang	stp		re, rf, [RSTATE, #16]
559a41b2129STianjia Zhang	clear_vec(XTMP5)
560a41b2129STianjia Zhang	eor		rh, rh, k_odd
561a41b2129STianjia Zhang	clear_vec(XTMP6)
562a41b2129STianjia Zhang	stp		rg, rh, [RSTATE, #24]
563a41b2129STianjia Zhang
564a41b2129STianjia Zhang	/* Clear message expansion area */
565a41b2129STianjia Zhang	add		addr0, sp, #STACK_W
566a41b2129STianjia Zhang	st1		{W0.16b-W3.16b}, [addr0], #64
567a41b2129STianjia Zhang	st1		{W0.16b-W3.16b}, [addr0], #64
568a41b2129STianjia Zhang	st1		{W0.16b-W3.16b}, [addr0]
569a41b2129STianjia Zhang
570a41b2129STianjia Zhang	mov		sp, RFRAME
571a41b2129STianjia Zhang
572a41b2129STianjia Zhang	ldp		x25, x26, [sp], #16
573a41b2129STianjia Zhang	ldp		x23, x24, [sp], #16
574a41b2129STianjia Zhang	ldp		x21, x22, [sp], #16
575a41b2129STianjia Zhang	ldp		x19, x20, [sp], #16
576a41b2129STianjia Zhang	ldp		x28, x29, [sp], #16
577a41b2129STianjia Zhang
578a41b2129STianjia Zhang	ret
579a41b2129STianjia ZhangSYM_FUNC_END(sm3_neon_transform)
580a41b2129STianjia Zhang
581a41b2129STianjia Zhang
582a41b2129STianjia Zhang	.section	".rodata", "a"
583a41b2129STianjia Zhang
/*
 * .LKtable - table of 64 pre-rotated 32-bit round constants, one per round
 * j = 0..63 (16 rows of 4 words, 256 bytes in total).
 *
 * Entry j equals rol32(T, j % 32), where
 *   T = 0x79cc4519 for j =  0..15
 *   T = 0x7a879d8a for j = 16..63
 * which matches the T_j constants of the SM3 specification.  Storing the
 * rotated values avoids computing the rotation at run time.
 *
 * The ".L" prefix keeps the label local to this object file.
 * NOTE(review): the code that reads this table (presumably through RKPTR)
 * is outside this part of the file - confirm the access pattern there
 * before changing the layout.
 *
 * On AArch64 GAS the ".align" operand is a power-of-two exponent, so
 * ".align 4" places the table on a 16-byte boundary.
 */
584a41b2129STianjia Zhang	.align 4
585a41b2129STianjia Zhang.LKtable:
	/* j = 0..15: 0x79cc4519 rotated left by 0..15 */
586a41b2129STianjia Zhang	.long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
587a41b2129STianjia Zhang	.long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
588a41b2129STianjia Zhang	.long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
589a41b2129STianjia Zhang	.long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
	/* j = 16..31: 0x7a879d8a rotated left by 16..31 */
590a41b2129STianjia Zhang	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
591a41b2129STianjia Zhang	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
592a41b2129STianjia Zhang	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
593a41b2129STianjia Zhang	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
	/* j = 32..47: rotate count wraps to 0..15, so j = 32 is T unrotated */
594a41b2129STianjia Zhang	.long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
595a41b2129STianjia Zhang	.long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
596a41b2129STianjia Zhang	.long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
597a41b2129STianjia Zhang	.long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
	/* j = 48..63: rotate count is 16..31 again, identical to j = 16..31 */
598a41b2129STianjia Zhang	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
599a41b2129STianjia Zhang	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
600a41b2129STianjia Zhang	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
601a41b2129STianjia Zhang	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
602