/*
 * xref: /openbmc/linux/arch/arm64/crypto/aes-neon.S
 * (revision e983940270f10fe8551baf0098be76ea478294a3)
 */

/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>

/*
 * Symbol wrappers: paste the neon_ prefix onto a function name before
 * handing it to ENTRY() / ENDPROC().
 */
#define AES_ENTRY(func)		ENTRY(neon_ ## func)
#define AES_ENDPROC(func)	ENDPROC(neon_ ## func)

16	/* multiply by polynomial 'x' in GF(2^8) */
17	.macro		mul_by_x, out, in, temp, const
18	sshr		\temp, \in, #7
19	add		\out, \in, \in
20	and		\temp, \temp, \const
21	eor		\out, \out, \temp
22	.endm
23
24	/* preload the entire Sbox */
25	.macro		prepare, sbox, shiftrows, temp
26	adr		\temp, \sbox
27	movi		v12.16b, #0x40
28	ldr		q13, \shiftrows
29	movi		v14.16b, #0x1b
30	ld1		{v16.16b-v19.16b}, [\temp], #64
31	ld1		{v20.16b-v23.16b}, [\temp], #64
32	ld1		{v24.16b-v27.16b}, [\temp], #64
33	ld1		{v28.16b-v31.16b}, [\temp]
34	.endm
35
36	/* do preload for encryption */
37	.macro		enc_prepare, ignore0, ignore1, temp
38	prepare		.LForward_Sbox, .LForward_ShiftRows, \temp
39	.endm
40
41	.macro		enc_switch_key, ignore0, ignore1, temp
42	/* do nothing */
43	.endm
44
45	/* do preload for decryption */
46	.macro		dec_prepare, ignore0, ignore1, temp
47	prepare		.LReverse_Sbox, .LReverse_ShiftRows, \temp
48	.endm
49
50	/* apply SubBytes transformation using the the preloaded Sbox */
51	.macro		sub_bytes, in
52	sub		v9.16b, \in\().16b, v12.16b
53	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
54	sub		v10.16b, v9.16b, v12.16b
55	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
56	sub		v11.16b, v10.16b, v12.16b
57	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
58	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
59	.endm
60
61	/* apply MixColumns transformation */
62	.macro		mix_columns, in
63	mul_by_x	v10.16b, \in\().16b, v9.16b, v14.16b
64	rev32		v8.8h, \in\().8h
65	eor		\in\().16b, v10.16b, \in\().16b
66	shl		v9.4s, v8.4s, #24
67	shl		v11.4s, \in\().4s, #24
68	sri		v9.4s, v8.4s, #8
69	sri		v11.4s, \in\().4s, #8
70	eor		v9.16b, v9.16b, v8.16b
71	eor		v10.16b, v10.16b, v9.16b
72	eor		\in\().16b, v10.16b, v11.16b
73	.endm
74
75	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
76	.macro		inv_mix_columns, in
77	mul_by_x	v11.16b, \in\().16b, v10.16b, v14.16b
78	mul_by_x	v11.16b, v11.16b, v10.16b, v14.16b
79	eor		\in\().16b, \in\().16b, v11.16b
80	rev32		v11.8h, v11.8h
81	eor		\in\().16b, \in\().16b, v11.16b
82	mix_columns	\in
83	.endm
84
85	.macro		do_block, enc, in, rounds, rk, rkp, i
86	ld1		{v15.16b}, [\rk]
87	add		\rkp, \rk, #16
88	mov		\i, \rounds
891111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
90	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
91	sub_bytes	\in
92	ld1		{v15.16b}, [\rkp], #16
93	subs		\i, \i, #1
94	beq		2222f
95	.if		\enc == 1
96	mix_columns	\in
97	.else
98	inv_mix_columns	\in
99	.endif
100	b		1111b
1012222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
102	.endm
103
104	.macro		encrypt_block, in, rounds, rk, rkp, i
105	do_block	1, \in, \rounds, \rk, \rkp, \i
106	.endm
107
108	.macro		decrypt_block, in, rounds, rk, rkp, i
109	do_block	0, \in, \rounds, \rk, \rkp, \i
110	.endm
111
	/*
	 * Interleaved versions: functionally equivalent to the
	 * ones above, but applied to 2 or 4 AES states in parallel.
	 */

117	.macro		sub_bytes_2x, in0, in1
118	sub		v8.16b, \in0\().16b, v12.16b
119	sub		v9.16b, \in1\().16b, v12.16b
120	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
121	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
122	sub		v10.16b, v8.16b, v12.16b
123	sub		v11.16b, v9.16b, v12.16b
124	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
125	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
126	sub		v8.16b, v10.16b, v12.16b
127	sub		v9.16b, v11.16b, v12.16b
128	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
129	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
130	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
131	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
132	.endm
133
134	.macro		sub_bytes_4x, in0, in1, in2, in3
135	sub		v8.16b, \in0\().16b, v12.16b
136	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
137	sub		v9.16b, \in1\().16b, v12.16b
138	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
139	sub		v10.16b, \in2\().16b, v12.16b
140	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
141	sub		v11.16b, \in3\().16b, v12.16b
142	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
143	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
144	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
145	sub		v8.16b, v8.16b, v12.16b
146	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
147	sub		v9.16b, v9.16b, v12.16b
148	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
149	sub		v10.16b, v10.16b, v12.16b
150	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
151	sub		v11.16b, v11.16b, v12.16b
152	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
153	sub		v8.16b, v8.16b, v12.16b
154	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
155	sub		v9.16b, v9.16b, v12.16b
156	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
157	sub		v10.16b, v10.16b, v12.16b
158	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
159	sub		v11.16b, v11.16b, v12.16b
160	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
161	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
162	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
163	.endm
164
165	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
166	sshr		\tmp0\().16b, \in0\().16b,  #7
167	add		\out0\().16b, \in0\().16b,  \in0\().16b
168	sshr		\tmp1\().16b, \in1\().16b,  #7
169	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
170	add		\out1\().16b, \in1\().16b,  \in1\().16b
171	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
172	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
173	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
174	.endm
175
176	.macro		mix_columns_2x, in0, in1
177	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
178	rev32		v10.8h, \in0\().8h
179	rev32		v11.8h, \in1\().8h
180	eor		\in0\().16b, v8.16b, \in0\().16b
181	eor		\in1\().16b, v9.16b, \in1\().16b
182	shl		v12.4s, v10.4s, #24
183	shl		v13.4s, v11.4s, #24
184	eor		v8.16b, v8.16b, v10.16b
185	sri		v12.4s, v10.4s, #8
186	shl		v10.4s, \in0\().4s, #24
187	eor		v9.16b, v9.16b, v11.16b
188	sri		v13.4s, v11.4s, #8
189	shl		v11.4s, \in1\().4s, #24
190	sri		v10.4s, \in0\().4s, #8
191	eor		\in0\().16b, v8.16b, v12.16b
192	sri		v11.4s, \in1\().4s, #8
193	eor		\in1\().16b, v9.16b, v13.16b
194	eor		\in0\().16b, v10.16b, \in0\().16b
195	eor		\in1\().16b, v11.16b, \in1\().16b
196	.endm
197
198	.macro		inv_mix_cols_2x, in0, in1
199	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
200	mul_by_x_2x	v8, v9, v8, v9, v10, v11, v14
201	eor		\in0\().16b, \in0\().16b, v8.16b
202	eor		\in1\().16b, \in1\().16b, v9.16b
203	rev32		v8.8h, v8.8h
204	rev32		v9.8h, v9.8h
205	eor		\in0\().16b, \in0\().16b, v8.16b
206	eor		\in1\().16b, \in1\().16b, v9.16b
207	mix_columns_2x	\in0, \in1
208	.endm
209
210	.macro		inv_mix_cols_4x, in0, in1, in2, in3
211	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
212	mul_by_x_2x	v10, v11, \in2, \in3, v12, v13, v14
213	mul_by_x_2x	v8, v9, v8, v9, v12, v13, v14
214	mul_by_x_2x	v10, v11, v10, v11, v12, v13, v14
215	eor		\in0\().16b, \in0\().16b, v8.16b
216	eor		\in1\().16b, \in1\().16b, v9.16b
217	eor		\in2\().16b, \in2\().16b, v10.16b
218	eor		\in3\().16b, \in3\().16b, v11.16b
219	rev32		v8.8h, v8.8h
220	rev32		v9.8h, v9.8h
221	rev32		v10.8h, v10.8h
222	rev32		v11.8h, v11.8h
223	eor		\in0\().16b, \in0\().16b, v8.16b
224	eor		\in1\().16b, \in1\().16b, v9.16b
225	eor		\in2\().16b, \in2\().16b, v10.16b
226	eor		\in3\().16b, \in3\().16b, v11.16b
227	mix_columns_2x	\in0, \in1
228	mix_columns_2x	\in2, \in3
229	.endm
230
231	.macro		do_block_2x, enc, in0, in1 rounds, rk, rkp, i
232	ld1		{v15.16b}, [\rk]
233	add		\rkp, \rk, #16
234	mov		\i, \rounds
2351111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
236	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
237	sub_bytes_2x	\in0, \in1
238	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
239	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
240	ld1		{v15.16b}, [\rkp], #16
241	subs		\i, \i, #1
242	beq		2222f
243	.if		\enc == 1
244	mix_columns_2x	\in0, \in1
245	ldr		q13, .LForward_ShiftRows
246	.else
247	inv_mix_cols_2x	\in0, \in1
248	ldr		q13, .LReverse_ShiftRows
249	.endif
250	movi		v12.16b, #0x40
251	b		1111b
2522222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
253	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
254	.endm
255
256	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
257	ld1		{v15.16b}, [\rk]
258	add		\rkp, \rk, #16
259	mov		\i, \rounds
2601111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
261	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
262	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
263	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
264	sub_bytes_4x	\in0, \in1, \in2, \in3
265	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
266	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
267	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
268	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
269	ld1		{v15.16b}, [\rkp], #16
270	subs		\i, \i, #1
271	beq		2222f
272	.if		\enc == 1
273	mix_columns_2x	\in0, \in1
274	mix_columns_2x	\in2, \in3
275	ldr		q13, .LForward_ShiftRows
276	.else
277	inv_mix_cols_4x	\in0, \in1, \in2, \in3
278	ldr		q13, .LReverse_ShiftRows
279	.endif
280	movi		v12.16b, #0x40
281	b		1111b
2822222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
283	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
284	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
285	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
286	.endm
287
288	.macro		encrypt_block2x, in0, in1, rounds, rk, rkp, i
289	do_block_2x	1, \in0, \in1, \rounds, \rk, \rkp, \i
290	.endm
291
292	.macro		decrypt_block2x, in0, in1, rounds, rk, rkp, i
293	do_block_2x	0, \in0, \in1, \rounds, \rk, \rkp, \i
294	.endm
295
296	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
297	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
298	.endm
299
300	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
301	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
302	.endm
303
#include "aes-modes.S"

	.text
	.align		4

	/*
	 * ShiftRows index vectors, used as the index operand of TBL:
	 * output byte n is the state byte whose number is stored at
	 * position n. Loaded into v13 by 'prepare' and by the 2x/4x
	 * block macros.
	 */
.LForward_ShiftRows:
	.byte		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3
	.byte		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb

.LReverse_ShiftRows:
	.byte		0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb
	.byte		0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3

316.LForward_Sbox:
317	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
318	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
319	.byte		0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
320	.byte		0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
321	.byte		0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
322	.byte		0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
323	.byte		0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
324	.byte		0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
325	.byte		0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
326	.byte		0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
327	.byte		0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
328	.byte		0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
329	.byte		0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
330	.byte		0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
331	.byte		0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
332	.byte		0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
333	.byte		0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
334	.byte		0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
335	.byte		0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
336	.byte		0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
337	.byte		0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
338	.byte		0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
339	.byte		0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
340	.byte		0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
341	.byte		0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
342	.byte		0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
343	.byte		0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
344	.byte		0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
345	.byte		0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
346	.byte		0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
347	.byte		0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
348	.byte		0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
349
350.LReverse_Sbox:
351	.byte		0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
352	.byte		0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
353	.byte		0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
354	.byte		0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
355	.byte		0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
356	.byte		0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
357	.byte		0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
358	.byte		0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
359	.byte		0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
360	.byte		0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
361	.byte		0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
362	.byte		0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
363	.byte		0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
364	.byte		0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
365	.byte		0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
366	.byte		0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
367	.byte		0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
368	.byte		0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
369	.byte		0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
370	.byte		0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
371	.byte		0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
372	.byte		0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
373	.byte		0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
374	.byte		0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
375	.byte		0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
376	.byte		0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
377	.byte		0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
378	.byte		0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
379	.byte		0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
380	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
381	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
382	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
383