/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Give the exported routines a "neon_" symbol prefix.  These macros are
 * presumably consumed by aes-modes.S (included at the bottom of this
 * file) when it emits its entry points -- confirm against that file.
 */
#define AES_ENTRY(func)		ENTRY(neon_ ## func)
#define AES_ENDPROC(func)	ENDPROC(neon_ ## func)

17	/* multiply by polynomial 'x' in GF(2^8) */
18	.macro		mul_by_x, out, in, temp, const
19	sshr		\temp, \in, #7
20	shl		\out, \in, #1
21	and		\temp, \temp, \const
22	eor		\out, \out, \temp
23	.endm
24
25	/* multiply by polynomial 'x^2' in GF(2^8) */
26	.macro		mul_by_x2, out, in, temp, const
27	ushr		\temp, \in, #6
28	shl		\out, \in, #2
29	pmul		\temp, \temp, \const
30	eor		\out, \out, \temp
31	.endm
32
33	/* preload the entire Sbox */
34	.macro		prepare, sbox, shiftrows, temp
35	movi		v12.16b, #0x1b
36	ldr_l		q13, \shiftrows, \temp
37	ldr_l		q14, .Lror32by8, \temp
38	adr_l		\temp, \sbox
39	ld1		{v16.16b-v19.16b}, [\temp], #64
40	ld1		{v20.16b-v23.16b}, [\temp], #64
41	ld1		{v24.16b-v27.16b}, [\temp], #64
42	ld1		{v28.16b-v31.16b}, [\temp]
43	.endm
44
45	/* do preload for encryption */
46	.macro		enc_prepare, ignore0, ignore1, temp
47	prepare		.LForward_Sbox, .LForward_ShiftRows, \temp
48	.endm
49
50	.macro		enc_switch_key, ignore0, ignore1, temp
51	/* do nothing */
52	.endm
53
54	/* do preload for decryption */
55	.macro		dec_prepare, ignore0, ignore1, temp
56	prepare		.LReverse_Sbox, .LReverse_ShiftRows, \temp
57	.endm
58
59	/* apply SubBytes transformation using the the preloaded Sbox */
60	.macro		sub_bytes, in
61	sub		v9.16b, \in\().16b, v15.16b
62	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
63	sub		v10.16b, v9.16b, v15.16b
64	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
65	sub		v11.16b, v10.16b, v15.16b
66	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
67	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
68	.endm
69
70	/* apply MixColumns transformation */
71	.macro		mix_columns, in, enc
72	.if		\enc == 0
73	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
74	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
75	eor		\in\().16b, \in\().16b, v8.16b
76	rev32		v8.8h, v8.8h
77	eor		\in\().16b, \in\().16b, v8.16b
78	.endif
79
80	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
81	rev32		v8.8h, \in\().8h
82	eor		v8.16b, v8.16b, v9.16b
83	eor		\in\().16b, \in\().16b, v8.16b
84	tbl		\in\().16b, {\in\().16b}, v14.16b
85	eor		\in\().16b, \in\().16b, v8.16b
86	.endm
87
88	.macro		do_block, enc, in, rounds, rk, rkp, i
89	ld1		{v15.4s}, [\rk]
90	add		\rkp, \rk, #16
91	mov		\i, \rounds
921111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
93	movi		v15.16b, #0x40
94	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
95	sub_bytes	\in
96	subs		\i, \i, #1
97	ld1		{v15.4s}, [\rkp], #16
98	beq		2222f
99	mix_columns	\in, \enc
100	b		1111b
1012222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
102	.endm
103
104	.macro		encrypt_block, in, rounds, rk, rkp, i
105	do_block	1, \in, \rounds, \rk, \rkp, \i
106	.endm
107
108	.macro		decrypt_block, in, rounds, rk, rkp, i
109	do_block	0, \in, \rounds, \rk, \rkp, \i
110	.endm
111
	/*
	 * Interleaved versions: functionally equivalent to the
	 * ones above, but applied to 2 or 4 AES states in parallel.
	 */

117	.macro		sub_bytes_2x, in0, in1
118	sub		v8.16b, \in0\().16b, v15.16b
119	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
120	sub		v9.16b, \in1\().16b, v15.16b
121	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
122	sub		v10.16b, v8.16b, v15.16b
123	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
124	sub		v11.16b, v9.16b, v15.16b
125	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
126	sub		v8.16b, v10.16b, v15.16b
127	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
128	sub		v9.16b, v11.16b, v15.16b
129	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
130	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
131	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
132	.endm
133
134	.macro		sub_bytes_4x, in0, in1, in2, in3
135	sub		v8.16b, \in0\().16b, v15.16b
136	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
137	sub		v9.16b, \in1\().16b, v15.16b
138	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
139	sub		v10.16b, \in2\().16b, v15.16b
140	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
141	sub		v11.16b, \in3\().16b, v15.16b
142	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
143	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
144	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
145	sub		v8.16b, v8.16b, v15.16b
146	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
147	sub		v9.16b, v9.16b, v15.16b
148	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
149	sub		v10.16b, v10.16b, v15.16b
150	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
151	sub		v11.16b, v11.16b, v15.16b
152	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
153	sub		v8.16b, v8.16b, v15.16b
154	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
155	sub		v9.16b, v9.16b, v15.16b
156	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
157	sub		v10.16b, v10.16b, v15.16b
158	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
159	sub		v11.16b, v11.16b, v15.16b
160	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
161	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
162	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
163	.endm
164
165	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
166	sshr		\tmp0\().16b, \in0\().16b, #7
167	shl		\out0\().16b, \in0\().16b, #1
168	sshr		\tmp1\().16b, \in1\().16b, #7
169	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
170	shl		\out1\().16b, \in1\().16b, #1
171	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
172	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
173	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
174	.endm
175
176	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
177	ushr		\tmp0\().16b, \in0\().16b, #6
178	shl		\out0\().16b, \in0\().16b, #2
179	ushr		\tmp1\().16b, \in1\().16b, #6
180	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
181	shl		\out1\().16b, \in1\().16b, #2
182	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
183	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
184	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
185	.endm
186
187	.macro		mix_columns_2x, in0, in1, enc
188	.if		\enc == 0
189	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
190	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
191	eor		\in0\().16b, \in0\().16b, v8.16b
192	rev32		v8.8h, v8.8h
193	eor		\in1\().16b, \in1\().16b, v9.16b
194	rev32		v9.8h, v9.8h
195	eor		\in0\().16b, \in0\().16b, v8.16b
196	eor		\in1\().16b, \in1\().16b, v9.16b
197	.endif
198
199	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
200	rev32		v10.8h, \in0\().8h
201	rev32		v11.8h, \in1\().8h
202	eor		v10.16b, v10.16b, v8.16b
203	eor		v11.16b, v11.16b, v9.16b
204	eor		\in0\().16b, \in0\().16b, v10.16b
205	eor		\in1\().16b, \in1\().16b, v11.16b
206	tbl		\in0\().16b, {\in0\().16b}, v14.16b
207	tbl		\in1\().16b, {\in1\().16b}, v14.16b
208	eor		\in0\().16b, \in0\().16b, v10.16b
209	eor		\in1\().16b, \in1\().16b, v11.16b
210	.endm
211
212	.macro		do_block_2x, enc, in0, in1, rounds, rk, rkp, i
213	ld1		{v15.4s}, [\rk]
214	add		\rkp, \rk, #16
215	mov		\i, \rounds
2161111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
217	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
218	movi		v15.16b, #0x40
219	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
220	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
221	sub_bytes_2x	\in0, \in1
222	subs		\i, \i, #1
223	ld1		{v15.4s}, [\rkp], #16
224	beq		2222f
225	mix_columns_2x	\in0, \in1, \enc
226	b		1111b
2272222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
228	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
229	.endm
230
231	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
232	ld1		{v15.4s}, [\rk]
233	add		\rkp, \rk, #16
234	mov		\i, \rounds
2351111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
236	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
237	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
238	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
239	movi		v15.16b, #0x40
240	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
241	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
242	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
243	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
244	sub_bytes_4x	\in0, \in1, \in2, \in3
245	subs		\i, \i, #1
246	ld1		{v15.4s}, [\rkp], #16
247	beq		2222f
248	mix_columns_2x	\in0, \in1, \enc
249	mix_columns_2x	\in2, \in3, \enc
250	b		1111b
2512222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
252	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
253	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
254	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
255	.endm
256
257	.macro		encrypt_block2x, in0, in1, rounds, rk, rkp, i
258	do_block_2x	1, \in0, \in1, \rounds, \rk, \rkp, \i
259	.endm
260
261	.macro		decrypt_block2x, in0, in1, rounds, rk, rkp, i
262	do_block_2x	0, \in0, \in1, \rounds, \rk, \rkp, \i
263	.endm
264
265	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
266	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
267	.endm
268
269	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
270	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
271	.endm
272
273#include "aes-modes.S"
274
275	.section	".rodata", "a"
276	.align		6
277.LForward_Sbox:
278	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
279	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
280	.byte		0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
281	.byte		0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
282	.byte		0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
283	.byte		0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
284	.byte		0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
285	.byte		0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
286	.byte		0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
287	.byte		0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
288	.byte		0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
289	.byte		0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
290	.byte		0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
291	.byte		0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
292	.byte		0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
293	.byte		0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
294	.byte		0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
295	.byte		0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
296	.byte		0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
297	.byte		0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
298	.byte		0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
299	.byte		0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
300	.byte		0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
301	.byte		0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
302	.byte		0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
303	.byte		0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
304	.byte		0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
305	.byte		0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
306	.byte		0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
307	.byte		0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
308	.byte		0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
309	.byte		0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
310
311.LReverse_Sbox:
312	.byte		0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
313	.byte		0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
314	.byte		0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
315	.byte		0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
316	.byte		0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
317	.byte		0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
318	.byte		0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
319	.byte		0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
320	.byte		0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
321	.byte		0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
322	.byte		0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
323	.byte		0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
324	.byte		0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
325	.byte		0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
326	.byte		0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
327	.byte		0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
328	.byte		0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
329	.byte		0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
330	.byte		0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
331	.byte		0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
332	.byte		0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
333	.byte		0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
334	.byte		0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
335	.byte		0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
336	.byte		0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
337	.byte		0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
338	.byte		0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
339	.byte		0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
340	.byte		0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
341	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
342	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
343	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
344
345.LForward_ShiftRows:
346	.octa		0x0b06010c07020d08030e09040f0a0500
347
348.LReverse_ShiftRows:
349	.octa		0x0306090c0f0205080b0e0104070a0d00
350
351.Lror32by8:
352	.octa		0x0c0f0e0d080b0a090407060500030201
353