/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

#define AES_ENTRY(func)		ENTRY(neon_ ## func)
#define AES_ENDPROC(func)	ENDPROC(neon_ ## func)
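	/*
	 * The mode routines in aes-modes.S (included at the end of this
	 * file) are emitted through AES_ENTRY/AES_ENDPROC, so they all get
	 * a neon_ prefix here, e.g. aes_ecb_encrypt -> neon_aes_ecb_encrypt.
	 */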

	/* multiply by polynomial 'x' in GF(2^8) */
	.macro		mul_by_x, out, in, temp, const
	sshr		\temp, \in, #7
	add		\out, \in, \in
	and		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm
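	/*
	 * Note: 'sshr #7' broadcasts the top bit of each byte into a
	 * 0x00/0xff mask, 'add' doubles the byte (shifting in a zero), and
	 * the masked constant 0x1b is XORed in to reduce the result modulo
	 * the AES polynomial x^8 + x^4 + x^3 + x + 1.
	 */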

	/* preload the entire Sbox */
	.macro		prepare, sbox, shiftrows, temp
	adr		\temp, \sbox
	movi		v12.16b, #0x40
	ldr		q13, \shiftrows
	movi		v14.16b, #0x1b
	ld1		{v16.16b-v19.16b}, [\temp], #64
	ld1		{v20.16b-v23.16b}, [\temp], #64
	ld1		{v24.16b-v27.16b}, [\temp], #64
	ld1		{v28.16b-v31.16b}, [\temp]
	.endm
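	/*
	 * After 'prepare', the fixed register allocation is:
	 *   v12     - 0x40 in each byte, the index step for the tbl/tbx
	 *             lookups in sub_bytes
	 *   v13     - the ShiftRows permutation
	 *   v14     - 0x1b in each byte, for the GF(2^8) reduction
	 *   v16-v31 - the 256 byte Sbox
	 */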

	/* do preload for encryption */
	.macro		enc_prepare, ignore0, ignore1, temp
	prepare		.LForward_Sbox, .LForward_ShiftRows, \temp
	.endm

	.macro		enc_switch_key, ignore0, ignore1, temp
	/* do nothing */
	.endm

	/* do preload for decryption */
	.macro		dec_prepare, ignore0, ignore1, temp
	prepare		.LReverse_Sbox, .LReverse_ShiftRows, \temp
	.endm

	/* apply SubBytes transformation using the preloaded Sbox */
	.macro		sub_bytes, in
	sub		v9.16b, \in\().16b, v12.16b
	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
	sub		v10.16b, v9.16b, v12.16b
	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v11.16b, v10.16b, v12.16b
	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
	.endm
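	/*
	 * A 256 byte lookup done as four 64 byte table lookups: the first
	 * tbl covers indices 0-63, and each subsequent tbx uses the index
	 * minus a multiple of 0x40 (v12), so only lanes falling into that
	 * 64 byte quarter are replaced while the others are left untouched.
	 */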

	/* apply MixColumns transformation */
	.macro		mix_columns, in
	mul_by_x	v10.16b, \in\().16b, v9.16b, v14.16b
	rev32		v8.8h, \in\().8h
	eor		\in\().16b, v10.16b, \in\().16b
	shl		v9.4s, v8.4s, #24
	shl		v11.4s, \in\().4s, #24
	sri		v9.4s, v8.4s, #8
	sri		v11.4s, \in\().4s, #8
	eor		v9.16b, v9.16b, v8.16b
	eor		v10.16b, v10.16b, v9.16b
	eor		\in\().16b, v10.16b, v11.16b
	.endm
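	/*
	 * MixColumns is evaluated per column as
	 *   out = 2.a ^ 3.rot1(a) ^ rot2(a) ^ rot3(a)
	 * where rotN rotates the column by N bytes: mul_by_x supplies the
	 * doubling, rev32 the two byte rotation and the shl/sri pairs the
	 * single byte rotations.
	 */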

	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	.macro		inv_mix_columns, in
	mul_by_x	v11.16b, \in\().16b, v10.16b, v14.16b
	mul_by_x	v11.16b, v11.16b, v10.16b, v14.16b
	eor		\in\().16b, \in\().16b, v11.16b
	rev32		v11.8h, v11.8h
	eor		\in\().16b, \in\().16b, v11.16b
	mix_columns	\in
	.endm
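	/*
	 * This relies on the factorisation of the InvMixColumns matrix
	 * { 0e, 0b, 0d, 09 } as { 02, 03, 01, 01 } x { 05, 00, 04, 00 }:
	 * each column is first multiplied by the polynomial 4.y^2 + 5
	 * (4.a via two doublings, XORed with a for 5.a, and rev32 for the
	 * two byte rotation), and then run through the forward MixColumns.
	 */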

	.macro		do_block, enc, in, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
	sub_bytes	\in
	ld1		{v15.4s}, [\rkp], #16
	subs		\i, \i, #1
	beq		2222f
	.if		\enc == 1
	mix_columns	\in
	.else
	inv_mix_columns	\in
	.endif
	b		1111b
2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	.endm
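	/*
	 * One AES round per loop iteration: AddRoundKey, ShiftRows (a tbl
	 * permutation through v13), SubBytes, then the next round key is
	 * fetched. MixColumns (or its inverse) is applied on all but the
	 * last round, which ends with the final AddRoundKey at label 2222.
	 */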

	.macro		encrypt_block, in, rounds, rk, rkp, i
	do_block	1, \in, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block, in, rounds, rk, rkp, i
	do_block	0, \in, \rounds, \rk, \rkp, \i
	.endm

	/*
	 * Interleaved versions: functionally equivalent to the
	 * ones above, but applied to 2 or 4 AES states in parallel.
	 */

	.macro		sub_bytes_2x, in0, in1
	sub		v8.16b, \in0\().16b, v12.16b
	sub		v9.16b, \in1\().16b, v12.16b
	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub		v10.16b, v8.16b, v12.16b
	sub		v11.16b, v9.16b, v12.16b
	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v8.16b, v10.16b, v12.16b
	sub		v9.16b, v11.16b, v12.16b
	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
	.endm
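	/*
	 * Same four step tbl/tbx lookup as sub_bytes, with the instructions
	 * for the two states interleaved so that independent operations sit
	 * between dependent ones.
	 */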

	.macro		sub_bytes_4x, in0, in1, in2, in3
	sub		v8.16b, \in0\().16b, v12.16b
	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	sub		v9.16b, \in1\().16b, v12.16b
	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub		v10.16b, \in2\().16b, v12.16b
	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
	sub		v11.16b, \in3\().16b, v12.16b
	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v8.16b, v8.16b, v12.16b
	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
	sub		v9.16b, v9.16b, v12.16b
	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
	sub		v10.16b, v10.16b, v12.16b
	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
	sub		v11.16b, v11.16b, v12.16b
	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
	sub		v8.16b, v8.16b, v12.16b
	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
	sub		v9.16b, v9.16b, v12.16b
	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
	sub		v10.16b, v10.16b, v12.16b
	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
	sub		v11.16b, v11.16b, v12.16b
	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
	.endm

	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
	sshr		\tmp0\().16b, \in0\().16b,  #7
	add		\out0\().16b, \in0\().16b,  \in0\().16b
	sshr		\tmp1\().16b, \in1\().16b,  #7
	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
	add		\out1\().16b, \in1\().16b,  \in1\().16b
	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	.macro		mix_columns_2x, in0, in1
	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
	rev32		v10.8h, \in0\().8h
	rev32		v11.8h, \in1\().8h
	eor		\in0\().16b, v8.16b, \in0\().16b
	eor		\in1\().16b, v9.16b, \in1\().16b
	shl		v12.4s, v10.4s, #24
	shl		v13.4s, v11.4s, #24
	eor		v8.16b, v8.16b, v10.16b
	sri		v12.4s, v10.4s, #8
	shl		v10.4s, \in0\().4s, #24
	eor		v9.16b, v9.16b, v11.16b
	sri		v13.4s, v11.4s, #8
	shl		v11.4s, \in1\().4s, #24
	sri		v10.4s, \in0\().4s, #8
	eor		\in0\().16b, v8.16b, v12.16b
	sri		v11.4s, \in1\().4s, #8
	eor		\in1\().16b, v9.16b, v13.16b
	eor		\in0\().16b, v10.16b, \in0\().16b
	eor		\in1\().16b, v11.16b, \in1\().16b
	.endm

	.macro		inv_mix_cols_2x, in0, in1
	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
	mul_by_x_2x	v8, v9, v8, v9, v10, v11, v14
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	rev32		v8.8h, v8.8h
	rev32		v9.8h, v9.8h
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	mix_columns_2x	\in0, \in1
	.endm

	.macro		inv_mix_cols_4x, in0, in1, in2, in3
	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
	mul_by_x_2x	v10, v11, \in2, \in3, v12, v13, v14
	mul_by_x_2x	v8, v9, v8, v9, v12, v13, v14
	mul_by_x_2x	v10, v11, v10, v11, v12, v13, v14
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	eor		\in2\().16b, \in2\().16b, v10.16b
	eor		\in3\().16b, \in3\().16b, v11.16b
	rev32		v8.8h, v8.8h
	rev32		v9.8h, v9.8h
	rev32		v10.8h, v10.8h
	rev32		v11.8h, v11.8h
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	eor		\in2\().16b, \in2\().16b, v10.16b
	eor		\in3\().16b, \in3\().16b, v11.16b
	mix_columns_2x	\in0, \in1
	mix_columns_2x	\in2, \in3
	.endm

	.macro		do_block_2x, enc, in0, in1, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	sub_bytes_2x	\in0, \in1
	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	ld1		{v15.4s}, [\rkp], #16
	subs		\i, \i, #1
	beq		2222f
	.if		\enc == 1
	mix_columns_2x	\in0, \in1
	ldr		q13, .LForward_ShiftRows
	.else
	inv_mix_cols_2x	\in0, \in1
	ldr		q13, .LReverse_ShiftRows
	.endif
	movi		v12.16b, #0x40
	b		1111b
2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	.endm
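	/*
	 * mix_columns_2x and inv_mix_cols_2x/_4x use v12 and v13 as scratch
	 * registers, which is why the loops in do_block_2x above and
	 * do_block_4x below reload the ShiftRows vector and the 0x40
	 * constant on every iteration.
	 */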

	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	sub_bytes_4x	\in0, \in1, \in2, \in3
	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
	ld1		{v15.4s}, [\rkp], #16
	subs		\i, \i, #1
	beq		2222f
	.if		\enc == 1
	mix_columns_2x	\in0, \in1
	mix_columns_2x	\in2, \in3
	ldr		q13, .LForward_ShiftRows
	.else
	inv_mix_cols_4x	\in0, \in1, \in2, \in3
	ldr		q13, .LReverse_ShiftRows
	.endif
	movi		v12.16b, #0x40
	b		1111b
2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	.endm

	.macro		encrypt_block2x, in0, in1, rounds, rk, rkp, i
	do_block_2x	1, \in0, \in1, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block2x, in0, in1, rounds, rk, rkp, i
	do_block_2x	0, \in0, \in1, \rounds, \rk, \rkp, \i
	.endm

	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

#include "aes-modes.S"

	.text
	.align		4
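	/*
	 * ShiftRows permutations for use with tbl. The CPU_BE rows are
	 * byte-reversed copies of the CPU_LE ones, since 'ldr q13' loads
	 * the table as a single 128-bit quantity and therefore sees the
	 * bytes in reverse order on a big-endian kernel.
	 */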
.LForward_ShiftRows:
CPU_LE(	.byte		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3	)
CPU_LE(	.byte		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb	)
CPU_BE(	.byte		0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8	)
CPU_BE(	.byte		0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0	)

.LReverse_ShiftRows:
CPU_LE(	.byte		0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb	)
CPU_LE(	.byte		0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3	)
CPU_BE(	.byte		0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8	)
CPU_BE(	.byte		0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0	)
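	/*
	 * The standard FIPS-197 forward and reverse S-boxes, 256 bytes each,
	 * loaded into v16-v31 by 'prepare' (with ld1, so no byte swapping
	 * is needed for big-endian).
	 */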

.LForward_Sbox:
	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
	.byte		0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
	.byte		0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
	.byte		0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
	.byte		0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
	.byte		0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
	.byte		0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
	.byte		0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
	.byte		0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
	.byte		0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
	.byte		0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
	.byte		0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
	.byte		0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
	.byte		0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
	.byte		0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
	.byte		0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
	.byte		0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
	.byte		0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
	.byte		0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
	.byte		0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
	.byte		0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
	.byte		0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
	.byte		0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
	.byte		0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
	.byte		0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
	.byte		0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
	.byte		0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
	.byte		0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
	.byte		0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
	.byte		0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
	.byte		0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16

.LReverse_Sbox:
	.byte		0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
	.byte		0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
	.byte		0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
	.byte		0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
	.byte		0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
	.byte		0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
	.byte		0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
	.byte		0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
	.byte		0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
	.byte		0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
	.byte		0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
	.byte		0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
	.byte		0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
	.byte		0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
	.byte		0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
	.byte		0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
	.byte		0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
	.byte		0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
	.byte		0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
	.byte		0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
	.byte		0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
	.byte		0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
	.byte		0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
	.byte		0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
	.byte		0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
	.byte		0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
	.byte		0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
	.byte		0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
	.byte		0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d