/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
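
/*
 * Eight AES blocks are processed in parallel in bit-sliced form: the
 * 'bitslice' transform below re-packs the eight NEON registers v0-v7 so
 * that each register holds one bit position of every byte of all eight
 * blocks. Every S-box evaluation then becomes a fixed sequence of logic
 * instructions on whole registers, with no data dependent table lookups,
 * which is what makes this implementation constant-time.
 */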

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text

	rounds		.req	x11
	bskey		.req	x12

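	/*
	 * Linear input/output transformations of the bit-sliced S-box: the
	 * sbox macro below computes SubBytes as in_bs_ch, then the shared
	 * GF(2^8) inversion, then out_bs_ch; inv_in_bs_ch/inv_out_bs_ch are
	 * the counterparts used by inv_sbox for InvSubBytes.
	 */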
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b2, \b2, \b1
	eor		\b5, \b5, \b6
	eor		\b3, \b3, \b0
	eor		\b6, \b6, \b2
	eor		\b5, \b5, \b0
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b4, \b4, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b1, \b1, \b5
	.endm

	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	eor		\b4, \b4, \b6
	eor		\b2, \b2, \b0
	eor		\b6, \b6, \b1
	eor		\b1, \b1, \b5
	eor		\b5, \b5, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b2, \b2, \b5
	eor		\b4, \b4, \b7
	.endm

	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	eor		\b1, \b1, \b7
	eor		\b4, \b4, \b7
	eor		\b7, \b7, \b5
	eor		\b1, \b1, \b3
	eor		\b2, \b2, \b5
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b1
	eor		\b2, \b2, \b0
	eor		\b5, \b5, \b3
	eor		\b4, \b4, \b6
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	.endm

	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	eor		\b1, \b1, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b4, \b4, \b5
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b5, \b5, \b0
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b2
	eor		\b2, \b2, \b1
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b0
	eor		\b5, \b5, \b6
	.endm

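	/*
	 * SubBytes is inversion in GF(2^8) followed by an affine transform,
	 * and the inversion is evaluated in the tower field GF(((2^2)^2)^2):
	 * mul_gf4 multiplies two bit-sliced GF(2^2) elements, mul_gf4_n_gf4
	 * does two such multiplications sharing temporaries, mul_gf16_2 does
	 * two GF(2^4) multiplications, and inv_gf256 combines them into a
	 * full GF(2^8) inversion. The standard identity behind this kind of
	 * decomposition, for c = a*x + b in GF((2^4)^2) with x^2 + x + L the
	 * defining polynomial, is
	 *
	 *	1/c = (a*x + (a + b)) * d, d = 1/(a^2*L + a*b + b^2) in GF(2^4)
	 *
	 * so one GF(2^4) inversion plus a handful of multiplications suffice.
	 */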
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
	eor		\t0, \y0, \y1
	and		\t0, \t0, \x0
	eor		\x0, \x0, \x1
	and		\t1, \x1, \y0
	and		\x0, \x0, \y1
	eor		\x1, \t1, \t0
	eor		\x0, \x0, \t1
	.endm

	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	eor		\t0, \y0, \y1
	eor		\t1, \y2, \y3
	and		\t0, \t0, \x0
	and		\t1, \t1, \x2
	eor		\x0, \x0, \x1
	eor		\x2, \x2, \x3
	and		\x1, \x1, \y0
	and		\x3, \x3, \y2
	and		\x0, \x0, \y1
	and		\x2, \x2, \y3
	eor		\x1, \x1, \x0
	eor		\x2, \x2, \x3
	eor		\x0, \x0, \t0
	eor		\x3, \x3, \t1
	.endm

	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
				    y0, y1, y2, y3, t0, t1, t2, t3
	eor		\t0, \x0, \x2
	eor		\t1, \x1, \x3
	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	eor		\x0, \x0, \t0
	eor		\x2, \x2, \t0
	eor		\x1, \x1, \t1
	eor		\x3, \x3, \t1
	eor		\t0, \x4, \x6
	eor		\t1, \x5, \x7
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
	eor		\x4, \x4, \t0
	eor		\x6, \x6, \t0
	eor		\x5, \x5, \t1
	eor		\x7, \x7, \t1
	.endm

	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
				   t0, t1, t2, t3, s0, s1, s2, s3
	eor		\t3, \x4, \x6
	eor		\t0, \x5, \x7
	eor		\t1, \x1, \x3
	eor		\s1, \x7, \x6
	eor		\s0, \x0, \x2
	eor		\s3, \t3, \t0
	orr		\t2, \t0, \t1
	and		\s2, \t3, \s0
	orr		\t3, \t3, \s0
	eor		\s0, \s0, \t1
	and		\t0, \t0, \t1
	eor		\t1, \x3, \x2
	and		\s3, \s3, \s0
	and		\s1, \s1, \t1
	eor		\t1, \x4, \x5
	eor		\s0, \x1, \x0
	eor		\t3, \t3, \s1
	eor		\t2, \t2, \s1
	and		\s1, \t1, \s0
	orr		\t1, \t1, \s0
	eor		\t3, \t3, \s3
	eor		\t0, \t0, \s1
	eor		\t2, \t2, \s2
	eor		\t1, \t1, \s3
	eor		\t0, \t0, \s2
	and		\s0, \x7, \x3
	eor		\t1, \t1, \s2
	and		\s1, \x6, \x2
	and		\s2, \x5, \x1
	orr		\s3, \x4, \x0
	eor		\t3, \t3, \s0
	eor		\t1, \t1, \s2
	eor		\s0, \t0, \s3
	eor		\t2, \t2, \s1
	and		\s2, \t3, \t1
	eor		\s1, \t2, \s2
	eor		\s3, \s0, \s2
	bsl		\s1, \t1, \s0
	not		\t0, \s0
	bsl		\s0, \s1, \s3
	bsl		\t0, \s1, \s3
	bsl		\s3, \t3, \t2
	eor		\t3, \t3, \t2
	and		\s2, \s0, \s3
	eor		\t1, \t1, \t0
	eor		\s2, \s2, \t3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm

	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
	.endm

	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
	.endm

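	/*
	 * Load the next eight bit-sliced round key vectors (128 bytes) into
	 * v16-v23 for add_round_key: enc_next_rk walks the converted key
	 * schedule forwards, dec_next_rk walks it backwards (pre-decrementing
	 * bskey).
	 */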
	.macro		enc_next_rk
	ldp		q16, q17, [bskey], #128
	ldp		q18, q19, [bskey, #-96]
	ldp		q20, q21, [bskey, #-64]
	ldp		q22, q23, [bskey, #-32]
	.endm

	.macro		dec_next_rk
	ldp		q16, q17, [bskey, #-128]!
	ldp		q18, q19, [bskey, #32]
	ldp		q20, q21, [bskey, #64]
	ldp		q22, q23, [bskey, #96]
	.endm

	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor		\x0\().16b, \x0\().16b, v16.16b
	eor		\x1\().16b, \x1\().16b, v17.16b
	eor		\x2\().16b, \x2\().16b, v18.16b
	eor		\x3\().16b, \x3\().16b, v19.16b
	eor		\x4\().16b, \x4\().16b, v20.16b
	eor		\x5\().16b, \x5\().16b, v21.16b
	eor		\x6\().16b, \x6\().16b, v22.16b
	eor		\x7\().16b, \x7\().16b, v23.16b
	.endm

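	/*
	 * ShiftRows only permutes bytes within a block, so on the bit-sliced
	 * state it is the same byte permutation applied to every bit plane:
	 * one tbl per register, with the index vector (SR/ISR or one of the
	 * M0-combined variants defined below) passed in as \mask.
	 */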
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
	.endm

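	/*
	 * (Inv)MixColumns on the bit-sliced state. As described in the paper
	 * referenced at the top, this layer needs no multiplications at all:
	 * it reduces to rotations of each 128-bit bit plane (the ext #12 and
	 * ext #8 instructions) combined with eors.
	 */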
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
	eor		\x2\().16b, \x2\().16b, \t2\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
	eor		\x3\().16b, \x3\().16b, \t3\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
	eor		\x4\().16b, \x4\().16b, \t4\().16b
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
	eor		\x5\().16b, \x5\().16b, \t5\().16b
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
	eor		\x6\().16b, \x6\().16b, \t6\().16b
	eor		\t1\().16b, \t1\().16b, \x0\().16b
	eor		\x7\().16b, \x7\().16b, \t7\().16b
	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x1\().16b
	eor		\t0\().16b, \t0\().16b, \x7\().16b
	eor		\t1\().16b, \t1\().16b, \x7\().16b
	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t5\().16b, \t5\().16b, \x4\().16b
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	eor		\t6\().16b, \t6\().16b, \x5\().16b
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x3\().16b
	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x6\().16b
	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x2\().16b
	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x7\().16b
	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x7\().16b
	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
	eor		\x7\().16b, \t1\().16b, \t5\().16b
	.ifb		\inv
	eor		\x2\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t3\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t2\().16b
	.else
	eor		\t3\().16b, \t3\().16b, \x4\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x2\().16b, \x3\().16b, \t6\().16b
	eor		\x3\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x6\().16b, \t2\().16b
	mov		\x6\().16b, \t3\().16b
	.endif
	.endm

	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				      t0, t1, t2, t3, t4, t5, t6, t7
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t0\().16b, \t0\().16b, \x0\().16b
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t6\().16b, \t6\().16b, \x6\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x7\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t1\().16b, \t1\().16b, \x1\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x2\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x3\().16b
	eor		\t4\().16b, \t4\().16b, \x4\().16b
	eor		\t5\().16b, \t5\().16b, \x5\().16b
	eor		\x0\().16b, \x0\().16b, \t6\().16b
	eor		\x1\().16b, \x1\().16b, \t6\().16b
	eor		\x2\().16b, \x2\().16b, \t0\().16b
	eor		\x4\().16b, \x4\().16b, \t2\().16b
	eor		\x3\().16b, \x3\().16b, \t1\().16b
	eor		\x1\().16b, \x1\().16b, \t7\().16b
	eor		\x2\().16b, \x2\().16b, \t7\().16b
	eor		\x4\().16b, \x4\().16b, \t6\().16b
	eor		\x5\().16b, \x5\().16b, \t3\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t7\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x7\().16b, \x7\().16b, \t5\().16b
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm

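	/*
	 * swapmove_2x is the usual bit-matrix transposition primitive,
	 * applied to two register pairs at once. For a single pair it is
	 * equivalent to the C sketch
	 *
	 *	t = ((b >> n) ^ a) & mask;
	 *	a ^= t;
	 *	b ^= t << n;
	 *
	 * i.e. it swaps the bits selected by 'mask' in a with the bits n
	 * positions above them in b. bitslice applies it with n = 1, 2, 4
	 * and masks 0x55, 0x33, 0x0f to transpose the 8x8 bit matrix formed
	 * by corresponding bytes of the eight input registers; the transform
	 * is an involution, which is why the same macro is used both to
	 * slice and to un-slice the state.
	 */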
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	ushr		\t0\().2d, \b0\().2d, #\n
	ushr		\t1\().2d, \b1\().2d, #\n
	eor		\t0\().16b, \t0\().16b, \a0\().16b
	eor		\t1\().16b, \t1\().16b, \a1\().16b
	and		\t0\().16b, \t0\().16b, \mask\().16b
	and		\t1\().16b, \t1\().16b, \mask\().16b
	eor		\a0\().16b, \a0\().16b, \t0\().16b
	shl		\t0\().2d, \t0\().2d, #\n
	eor		\a1\().16b, \a1\().16b, \t1\().16b
	shl		\t1\().2d, \t1\().2d, #\n
	eor		\b0\().16b, \b0\().16b, \t0\().16b
	eor		\b1\().16b, \b1\().16b, \t1\().16b
	.endm

	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	movi		\t0\().16b, #0x55
	movi		\t1\().16b, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	movi		\t0\().16b, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm


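	/*
	 * Permutation index vectors for tbl: M0 is the byte ordering applied
	 * to the key material (and, in combined form, to the input blocks)
	 * before bit-slicing; SR and ISR are ShiftRows and InvShiftRows in
	 * that ordering; M0SR/SRM0 and M0ISR/ISRM0 combine M0 with the
	 * first/last (Inv)ShiftRows so that a single tbl performs both the
	 * (un)reordering and the row shift.
	 */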
	.align		6
M0:	.octa		0x0004080c0105090d02060a0e03070b0f

M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f

M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f

	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 */
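	/*
	 * Convert a key schedule produced by the generic AES key expansion
	 * into the layout used by aesbs_encrypt8/aesbs_decrypt8: the round 0
	 * key and the final round key are kept as plain 16-byte vectors,
	 * while every intermediate round key is expanded into eight 128-bit
	 * bit planes (128 bytes). The planes for bits 0, 1, 5 and 6 are
	 * inverted and the final round key is xored with 0x63 (0b01100011,
	 * i.e. exactly those bits), which accounts for the constant of the
	 * S-box affine transform.
	 */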
ENTRY(aesbs_convert_key)
	ld1		{v7.4s}, [x1], #16		// load round 0 key
	ld1		{v17.4s}, [x1], #16		// load round 1 key

	movi		v8.16b,  #0x01			// bit masks
	movi		v9.16b,  #0x02
	movi		v10.16b, #0x04
	movi		v11.16b, #0x08
	movi		v12.16b, #0x10
	movi		v13.16b, #0x20
	movi		v14.16b, #0x40
	movi		v15.16b, #0x80
	ldr		q16, M0

	sub		x2, x2, #1
	str		q7, [x0], #16		// save round 0 key

.Lkey_loop:
	tbl		v7.16b, {v17.16b}, v16.16b
	ld1		{v17.4s}, [x1], #16		// load next round key

	cmtst		v0.16b, v7.16b, v8.16b
	cmtst		v1.16b, v7.16b, v9.16b
	cmtst		v2.16b, v7.16b, v10.16b
	cmtst		v3.16b, v7.16b, v11.16b
	cmtst		v4.16b, v7.16b, v12.16b
	cmtst		v5.16b, v7.16b, v13.16b
	cmtst		v6.16b, v7.16b, v14.16b
	cmtst		v7.16b, v7.16b, v15.16b
	not		v0.16b, v0.16b
	not		v1.16b, v1.16b
	not		v5.16b, v5.16b
	not		v6.16b, v6.16b

	subs		x2, x2, #1
	stp		q0, q1, [x0], #128
	stp		q2, q3, [x0, #-96]
	stp		q4, q5, [x0, #-64]
	stp		q6, q7, [x0, #-32]
	b.ne		.Lkey_loop

	movi		v7.16b, #0x63			// compose .L63
	eor		v17.16b, v17.16b, v7.16b
	str		q17, [x0]
	ret
ENDPROC(aesbs_convert_key)

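	/*
	 * Internal helpers: encrypt/decrypt the eight blocks passed in v0-v7
	 * using the converted key schedule at bskey and the round count in
	 * rounds. The results are returned un-bit-sliced but in permuted
	 * register order (v0, v1, v4, v6, v3, v7, v2, v5 for encryption and
	 * v0, v1, v6, v4, v2, v7, v3, v5 for decryption), which is why the
	 * callers below list their output operands in that order. v8-v24 are
	 * clobbered and bskey is modified.
	 */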
	.align		4
aesbs_encrypt8:
	ldr		q9, [bskey], #16		// round 0 key
	ldr		q8, M0SR
	ldr		q24, SR

	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor		v11.16b, v1.16b, v9.16b
	tbl		v0.16b, {v10.16b}, v8.16b
	eor		v12.16b, v2.16b, v9.16b
	tbl		v1.16b, {v11.16b}, v8.16b
	eor		v13.16b, v3.16b, v9.16b
	tbl		v2.16b, {v12.16b}, v8.16b
	eor		v14.16b, v4.16b, v9.16b
	tbl		v3.16b, {v13.16b}, v8.16b
	eor		v15.16b, v5.16b, v9.16b
	tbl		v4.16b, {v14.16b}, v8.16b
	eor		v10.16b, v6.16b, v9.16b
	tbl		v5.16b, {v15.16b}, v8.16b
	eor		v11.16b, v7.16b, v9.16b
	tbl		v6.16b, {v10.16b}, v8.16b
	tbl		v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub		rounds, rounds, #1
	b		.Lenc_sbox

.Lenc_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
								v13, v14, v15
	subs		rounds, rounds, #1
	b.cc		.Lenc_done

	enc_next_rk

	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
								v13, v14, v15

	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7

	b.ne		.Lenc_loop
	ldr		q24, SRM0
	b		.Lenc_loop

.Lenc_done:
	ldr		q12, [bskey]			// last round key

	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

	eor		v0.16b, v0.16b, v12.16b
	eor		v1.16b, v1.16b, v12.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v6.16b, v6.16b, v12.16b
	eor		v3.16b, v3.16b, v12.16b
	eor		v7.16b, v7.16b, v12.16b
	eor		v2.16b, v2.16b, v12.16b
	eor		v5.16b, v5.16b, v12.16b
	ret
ENDPROC(aesbs_encrypt8)

	.align		4
aesbs_decrypt8:
	lsl		x9, rounds, #7
	add		bskey, bskey, x9

	ldr		q9, [bskey, #-112]!		// round 0 key
	ldr		q8, M0ISR
	ldr		q24, ISR

	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor		v11.16b, v1.16b, v9.16b
	tbl		v0.16b, {v10.16b}, v8.16b
	eor		v12.16b, v2.16b, v9.16b
	tbl		v1.16b, {v11.16b}, v8.16b
	eor		v13.16b, v3.16b, v9.16b
	tbl		v2.16b, {v12.16b}, v8.16b
	eor		v14.16b, v4.16b, v9.16b
	tbl		v3.16b, {v13.16b}, v8.16b
	eor		v15.16b, v5.16b, v9.16b
	tbl		v4.16b, {v14.16b}, v8.16b
	eor		v10.16b, v6.16b, v9.16b
	tbl		v5.16b, {v15.16b}, v8.16b
	eor		v11.16b, v7.16b, v9.16b
	tbl		v6.16b, {v10.16b}, v8.16b
	tbl		v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub		rounds, rounds, #1
	b		.Ldec_sbox

.Ldec_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
								v13, v14, v15
	subs		rounds, rounds, #1
	b.cc		.Ldec_done

	dec_next_rk

	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5

	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
								v13, v14, v15

	b.ne		.Ldec_loop
	ldr		q24, ISRM0
	b		.Ldec_loop
.Ldec_done:
	ldr		q12, [bskey, #-16]		// last round key

	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

	eor		v0.16b, v0.16b, v12.16b
	eor		v1.16b, v1.16b, v12.16b
	eor		v6.16b, v6.16b, v12.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v2.16b, v2.16b, v12.16b
	eor		v7.16b, v7.16b, v12.16b
	eor		v3.16b, v3.16b, v12.16b
	eor		v5.16b, v5.16b, v12.16b
	ret
ENDPROC(aesbs_decrypt8)

	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 */
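	/*
	 * All of the mode routines below use the same bookkeeping to cope
	 * with a final call of fewer than eight blocks. With 'blocks' in w4:
	 *
	 *	mask      = (blocks < 8)  ? (1 << blocks) : 0;	// x5/x6/x9
	 *	remaining = (blocks >= 8) ? (blocks - 8)  : 0;	// x4
	 *
	 * so the tbnz instructions cut the load/store sequences short after
	 * exactly 'blocks' blocks, and the outer loop repeats while blocks
	 * remain. (The CTR code varies this slightly to squeeze in the extra
	 * 'final' block.)
	 */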
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

99:	mov		x5, #1
	lsl		x5, x5, x4
	subs		w4, w4, #8
	csel		x4, x4, xzr, pl
	csel		x5, x5, xzr, mi

	ld1		{v0.16b}, [x1], #16
	tbnz		x5, #1, 0f
	ld1		{v1.16b}, [x1], #16
	tbnz		x5, #2, 0f
	ld1		{v2.16b}, [x1], #16
	tbnz		x5, #3, 0f
	ld1		{v3.16b}, [x1], #16
	tbnz		x5, #4, 0f
	ld1		{v4.16b}, [x1], #16
	tbnz		x5, #5, 0f
	ld1		{v5.16b}, [x1], #16
	tbnz		x5, #6, 0f
	ld1		{v6.16b}, [x1], #16
	tbnz		x5, #7, 0f
	ld1		{v7.16b}, [x1], #16

0:	mov		bskey, x2
	mov		rounds, x3
	bl		\do8

	st1		{\o0\().16b}, [x0], #16
	tbnz		x5, #1, 1f
	st1		{\o1\().16b}, [x0], #16
	tbnz		x5, #2, 1f
	st1		{\o2\().16b}, [x0], #16
	tbnz		x5, #3, 1f
	st1		{\o3\().16b}, [x0], #16
	tbnz		x5, #4, 1f
	st1		{\o4\().16b}, [x0], #16
	tbnz		x5, #5, 1f
	st1		{\o5\().16b}, [x0], #16
	tbnz		x5, #6, 1f
	st1		{\o6\().16b}, [x0], #16
	tbnz		x5, #7, 1f
	st1		{\o7\().16b}, [x0], #16

	cbnz		x4, 99b

1:	ldp		x29, x30, [sp], #16
	ret
	.endm

	.align		4
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_ecb_encrypt)

	.align		4
ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_ecb_decrypt)

	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */
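	/*
	 * Uses the same x6 stop mask / x4 remaining count scheme as
	 * __ecb_crypt. The incoming ciphertext blocks are saved (in v25-v31,
	 * or re-read from memory for the eighth block) because CBC
	 * decryption xors each decrypted block with the previous ciphertext
	 * block (or with the IV in v24 for the first one); the last
	 * ciphertext handled is stored back to [x5] as the IV for the next
	 * call.
	 */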
	.align		4
ENTRY(aesbs_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

99:	mov		x6, #1
	lsl		x6, x6, x4
	subs		w4, w4, #8
	csel		x4, x4, xzr, pl
	csel		x6, x6, xzr, mi

	ld1		{v0.16b}, [x1], #16
	mov		v25.16b, v0.16b
	tbnz		x6, #1, 0f
	ld1		{v1.16b}, [x1], #16
	mov		v26.16b, v1.16b
	tbnz		x6, #2, 0f
	ld1		{v2.16b}, [x1], #16
	mov		v27.16b, v2.16b
	tbnz		x6, #3, 0f
	ld1		{v3.16b}, [x1], #16
	mov		v28.16b, v3.16b
	tbnz		x6, #4, 0f
	ld1		{v4.16b}, [x1], #16
	mov		v29.16b, v4.16b
	tbnz		x6, #5, 0f
	ld1		{v5.16b}, [x1], #16
	mov		v30.16b, v5.16b
	tbnz		x6, #6, 0f
	ld1		{v6.16b}, [x1], #16
	mov		v31.16b, v6.16b
	tbnz		x6, #7, 0f
	ld1		{v7.16b}, [x1]

0:	mov		bskey, x2
	mov		rounds, x3
	bl		aesbs_decrypt8

	ld1		{v24.16b}, [x5]			// load IV

	eor		v1.16b, v1.16b, v25.16b
	eor		v6.16b, v6.16b, v26.16b
	eor		v4.16b, v4.16b, v27.16b
	eor		v2.16b, v2.16b, v28.16b
	eor		v7.16b, v7.16b, v29.16b
	eor		v0.16b, v0.16b, v24.16b
	eor		v3.16b, v3.16b, v30.16b
	eor		v5.16b, v5.16b, v31.16b

	st1		{v0.16b}, [x0], #16
	mov		v24.16b, v25.16b
	tbnz		x6, #1, 1f
	st1		{v1.16b}, [x0], #16
	mov		v24.16b, v26.16b
	tbnz		x6, #2, 1f
	st1		{v6.16b}, [x0], #16
	mov		v24.16b, v27.16b
	tbnz		x6, #3, 1f
	st1		{v4.16b}, [x0], #16
	mov		v24.16b, v28.16b
	tbnz		x6, #4, 1f
	st1		{v2.16b}, [x0], #16
	mov		v24.16b, v29.16b
	tbnz		x6, #5, 1f
	st1		{v7.16b}, [x0], #16
	mov		v24.16b, v30.16b
	tbnz		x6, #6, 1f
	st1		{v3.16b}, [x0], #16
	mov		v24.16b, v31.16b
	tbnz		x6, #7, 1f
	ld1		{v24.16b}, [x1], #16
	st1		{v5.16b}, [x0], #16
1:	st1		{v24.16b}, [x5]			// store IV

	cbnz		x4, 99b

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(aesbs_cbc_decrypt)

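	/*
	 * Advance the XTS tweak: multiply it by x in GF(2^128), i.e.
	 * t' = (t << 1) ^ (0x87 if the top bit of t was set), using the
	 * little-endian .Lxts_mul_x constant below. The add doubles each
	 * 64-bit lane, and the sshr/and/ext/eor sequence re-inserts the
	 * carry out of the low lane and applies the 0x87 reduction for the
	 * carry out of the high lane.
	 */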
	.macro		next_tweak, out, in, const, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, \const\().16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	.align		4
.Lxts_mul_x:
CPU_LE(	.quad		1, 0x87		)
CPU_BE(	.quad		0x87, 1		)

	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */
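	/*
	 * __xts_crypt8 loads up to eight blocks, xors each with the current
	 * tweak and computes the tweaks for the blocks that follow. The
	 * tweaks for blocks 4-7 are spilled to the stack frame set up by
	 * __xts_crypt (offsets 16-64) because every spare NEON register is
	 * clobbered across the aesbs_encrypt8/aesbs_decrypt8 call; they are
	 * reloaded into q16-q19 afterwards.
	 */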
__xts_crypt8:
	mov		x6, #1
	lsl		x6, x6, x4
	subs		w4, w4, #8
	csel		x4, x4, xzr, pl
	csel		x6, x6, xzr, mi

	ld1		{v0.16b}, [x1], #16
	next_tweak	v26, v25, v30, v31
	eor		v0.16b, v0.16b, v25.16b
	tbnz		x6, #1, 0f

	ld1		{v1.16b}, [x1], #16
	next_tweak	v27, v26, v30, v31
	eor		v1.16b, v1.16b, v26.16b
	tbnz		x6, #2, 0f

	ld1		{v2.16b}, [x1], #16
	next_tweak	v28, v27, v30, v31
	eor		v2.16b, v2.16b, v27.16b
	tbnz		x6, #3, 0f

	ld1		{v3.16b}, [x1], #16
	next_tweak	v29, v28, v30, v31
	eor		v3.16b, v3.16b, v28.16b
	tbnz		x6, #4, 0f

	ld1		{v4.16b}, [x1], #16
	str		q29, [sp, #16]
	eor		v4.16b, v4.16b, v29.16b
	next_tweak	v29, v29, v30, v31
	tbnz		x6, #5, 0f

	ld1		{v5.16b}, [x1], #16
	str		q29, [sp, #32]
	eor		v5.16b, v5.16b, v29.16b
	next_tweak	v29, v29, v30, v31
	tbnz		x6, #6, 0f

	ld1		{v6.16b}, [x1], #16
	str		q29, [sp, #48]
	eor		v6.16b, v6.16b, v29.16b
	next_tweak	v29, v29, v30, v31
	tbnz		x6, #7, 0f

	ld1		{v7.16b}, [x1], #16
	str		q29, [sp, #64]
	eor		v7.16b, v7.16b, v29.16b
	next_tweak	v29, v29, v30, v31

0:	mov		bskey, x2
	mov		rounds, x3
	br		x7
ENDPROC(__xts_crypt8)

	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	stp		x29, x30, [sp, #-80]!
	mov		x29, sp

	ldr		q30, .Lxts_mul_x
	ld1		{v25.16b}, [x5]

99:	adr		x7, \do8
	bl		__xts_crypt8

	ldp		q16, q17, [sp, #16]
	ldp		q18, q19, [sp, #48]

	eor		\o0\().16b, \o0\().16b, v25.16b
	eor		\o1\().16b, \o1\().16b, v26.16b
	eor		\o2\().16b, \o2\().16b, v27.16b
	eor		\o3\().16b, \o3\().16b, v28.16b

	st1		{\o0\().16b}, [x0], #16
	mov		v25.16b, v26.16b
	tbnz		x6, #1, 1f
	st1		{\o1\().16b}, [x0], #16
	mov		v25.16b, v27.16b
	tbnz		x6, #2, 1f
	st1		{\o2\().16b}, [x0], #16
	mov		v25.16b, v28.16b
	tbnz		x6, #3, 1f
	st1		{\o3\().16b}, [x0], #16
	mov		v25.16b, v29.16b
	tbnz		x6, #4, 1f

	eor		\o4\().16b, \o4\().16b, v16.16b
	eor		\o5\().16b, \o5\().16b, v17.16b
	eor		\o6\().16b, \o6\().16b, v18.16b
	eor		\o7\().16b, \o7\().16b, v19.16b

	st1		{\o4\().16b}, [x0], #16
	tbnz		x6, #5, 1f
	st1		{\o5\().16b}, [x0], #16
	tbnz		x6, #6, 1f
	st1		{\o6\().16b}, [x0], #16
	tbnz		x6, #7, 1f
	st1		{\o7\().16b}, [x0], #16

	cbnz		x4, 99b

1:	st1		{v25.16b}, [x5]
	ldp		x29, x30, [sp], #80
	ret
	.endm

ENTRY(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_xts_decrypt)

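	/*
	 * Materialise the next counter block in \v from the 128-bit counter
	 * kept as a native integer in x7 (high half) and x8 (low half), and
	 * post-increment the counter with carry; rev64 puts the bytes of
	 * each half back into the big-endian order CTR mode uses on the
	 * wire.
	 */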
	.macro		next_ctr, v
	mov		\v\().d[1], x8
	adds		x8, x8, #1
	mov		\v\().d[0], x7
	adc		x7, x7, xzr
	rev64		\v\().16b, \v\().16b
	.endm

	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[], u8 final[])
	 */
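	/*
	 * If 'final' is non-NULL the last block of the request is partial:
	 * x4 is bumped so that one extra keystream block is generated, and
	 * instead of being xored into the output stream directly it is
	 * written to the 'final' buffer (see the 1-7 labels at the end of
	 * the function) for the caller to combine with the partial block.
	 */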
ENTRY(aesbs_ctr_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	cmp		x6, #0
	cset		x10, ne
	add		x4, x4, x10		// do one extra block if final

	ldp		x7, x8, [x5]
	ld1		{v0.16b}, [x5]
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)
	adds		x8, x8, #1
	adc		x7, x7, xzr

99:	mov		x9, #1
	lsl		x9, x9, x4
	subs		w4, w4, #8
	csel		x4, x4, xzr, pl
	csel		x9, x9, xzr, le

	tbnz		x9, #1, 0f
	next_ctr	v1
	tbnz		x9, #2, 0f
	next_ctr	v2
	tbnz		x9, #3, 0f
	next_ctr	v3
	tbnz		x9, #4, 0f
	next_ctr	v4
	tbnz		x9, #5, 0f
	next_ctr	v5
	tbnz		x9, #6, 0f
	next_ctr	v6
	tbnz		x9, #7, 0f
	next_ctr	v7

0:	mov		bskey, x2
	mov		rounds, x3
	bl		aesbs_encrypt8

	lsr		x9, x9, x10		// disregard the extra block
	tbnz		x9, #0, 0f

	ld1		{v8.16b}, [x1], #16
	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x0], #16
	tbnz		x9, #1, 1f

	ld1		{v9.16b}, [x1], #16
	eor		v1.16b, v1.16b, v9.16b
	st1		{v1.16b}, [x0], #16
	tbnz		x9, #2, 2f

	ld1		{v10.16b}, [x1], #16
	eor		v4.16b, v4.16b, v10.16b
	st1		{v4.16b}, [x0], #16
	tbnz		x9, #3, 3f

	ld1		{v11.16b}, [x1], #16
	eor		v6.16b, v6.16b, v11.16b
	st1		{v6.16b}, [x0], #16
	tbnz		x9, #4, 4f

	ld1		{v12.16b}, [x1], #16
	eor		v3.16b, v3.16b, v12.16b
	st1		{v3.16b}, [x0], #16
	tbnz		x9, #5, 5f

	ld1		{v13.16b}, [x1], #16
	eor		v7.16b, v7.16b, v13.16b
	st1		{v7.16b}, [x0], #16
	tbnz		x9, #6, 6f

	ld1		{v14.16b}, [x1], #16
	eor		v2.16b, v2.16b, v14.16b
	st1		{v2.16b}, [x0], #16
	tbnz		x9, #7, 7f

	ld1		{v15.16b}, [x1], #16
	eor		v5.16b, v5.16b, v15.16b
	st1		{v5.16b}, [x0], #16

8:	next_ctr	v0
	cbnz		x4, 99b

0:	st1		{v0.16b}, [x5]
	ldp		x29, x30, [sp], #16
	ret

	/*
	 * If we are handling the tail of the input (x6 != NULL), return the
	 * final keystream block back to the caller.
	 */
1:	cbz		x6, 8b
	st1		{v1.16b}, [x6]
	b		8b
2:	cbz		x6, 8b
	st1		{v4.16b}, [x6]
	b		8b
3:	cbz		x6, 8b
	st1		{v6.16b}, [x6]
	b		8b
4:	cbz		x6, 8b
	st1		{v3.16b}, [x6]
	b		8b
5:	cbz		x6, 8b
	st1		{v7.16b}, [x6]
	b		8b
6:	cbz		x6, 8b
	st1		{v2.16b}, [x6]
	b		8b
7:	cbz		x6, 8b
	st1		{v5.16b}, [x6]
	b		8b
ENDPROC(aesbs_ctr_encrypt)