/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast AES implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */

#include <asm/ppc_asm.h>
#include "aes-spe-regs.h"

/*
 * EAD(in, bpos): compute a lookup address in rT0.
 *
 * Rotates `in` so that the byte selected by bpos (0 = least significant
 * byte ... 3 = most significant byte) lands in bits 20-27 (IBM numbering,
 * mask 0x00000ff0) and inserts it into rT0; every other bit of rT0 is kept.
 * The selected byte thus becomes an offset in steps of 16 bytes.
 * NOTE(review): presumably rT0 holds a 4 KiB aligned table base so that the
 * inserted bits act as a pure index - confirm against the callers.
 */
#define	EAD(in, bpos) \
	rlwimi		rT0,in,28-((bpos+3)%4)*8,20,27;

/*
 * DAD(in, bpos): same byte selection as EAD, but the byte is inserted
 * unscaled into bits 24-31 (mask 0x000000ff) of rT1, i.e. it indexes a
 * table of single bytes. The upper 24 bits of rT1 are kept.
 */
#define DAD(in, bpos) \
	rlwimi		rT1,in,24-((bpos+3)%4)*8,24,31;

/*
 * LWH(out, off): load the word at off(rT0) with evlwwsplat, which writes it
 * to both halves of the 64 bit register `out`.
 */
#define LWH(out, off) \
	evlwwsplat	out,off(rT0);	/* load word high		*/

/*
 * LWL(out, off): load the word at off(rT0) with a plain lwz.
 * NOTE(review): the round code issues LWH and LWL on the same register and
 * then uses both halves, so it relies on lwz leaving the upper half of the
 * 64 bit register untouched - confirm for the targeted cores.
 */
#define LWL(out, off) \
	lwz		out,off(rT0);	/* load word low		*/

/* LBZ(out, tab, off): load the single byte at off(tab) into out. */
#define LBZ(out, tab, off) \
	lbz		out,off(tab);	/* load byte			*/

/* LAH/LAL: EAD address calculation immediately followed by LWH/LWL. */
#define LAH(out, in, bpos, off) \
	EAD(in, bpos)			/* calc addr + load word high	*/ \
	LWH(out, off)

#define LAL(out, in, bpos, off) \
	EAD(in, bpos)			/* calc addr + load word low	*/ \
	LWL(out, off)

/*
 * LAE/LBE: byte loads for the last encryption round. Both read the byte at
 * offset 8 of the entry rT0 points to; LAE recomputes rT0 first, LBE uses
 * the address left behind by a preceding EAD.
 */
#define LAE(out, in, bpos) \
	EAD(in, bpos)			/* calc addr + load enc byte	*/ \
	LBZ(out, rT0, 8)

#define LBE(out) \
	LBZ(out, rT0, 8)		/* load enc byte		*/

/*
 * LAD/LBD: byte loads for the last decryption round. Both read the byte at
 * 0(rT1); LAD recomputes rT1 via DAD first, LBD uses the address left
 * behind by a preceding DAD.
 */
#define LAD(out, in, bpos) \
	DAD(in, bpos)			/* calc addr + load dec byte	*/ \
	LBZ(out, rT1, 0)

#define LBD(out) \
	LBZ(out, rT1, 0)

/*
 * ppc_encrypt_block: The central encryption function for a single 16 bytes
 * block. It does no stack handling or register saving to support fast calls
 * via bl/blr. It expects that caller has pre-xored input data with first
 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
 * and rW0-rW3 and caller must execute a final xor on the output registers.
 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
 *
 * Structure of the code below:
 *  - The loop body holds two unrolled rounds. The first reads its key from
 *    16(rKP)/24(rKP), the second from 32(rKP)/40(rKP); afterwards rKP is
 *    advanced by 32 and CTR is decremented (bdnz). CTR is therefore the
 *    number of round pairs executed by the loop.
 *  - After the loop one more table based round follows (key at 16/24(rKP)).
 *  - The last round gathers single bytes (LAE/LBE) and merges four of them
 *    into each of rW0-rW3 with rlwimi, while rD0-rD3 are loaded with the
 *    four key words at 32..44(rKP). These two register sets are the
 *    "output" the caller has to xor.
 *  - The lookups for rW4, rW6 and rW3 are always issued at the tail of the
 *    preceding round (or in the three instructions before the loop label
 *    for the very first round), interleaved with the xor chain.
 *
 * rKP is modified (advanced by 32 bytes per loop iteration); the variable
 * bits of rT0 are overwritten by every address calculation.
 *
 */
_GLOBAL(ppc_encrypt_block)
	/* lookups for rW4/rW6/rW3 (high halves) of the first round */
	LAH(rW4, rD1, 2, 4)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_encrypt_block_loop:
	/* ---- first round of the pair, key at 16(rKP)/24(rKP) ---- */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	/* xor chain interleaved with the remaining/next lookups */
	evldw		rD1,16(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	/* low word of rD0 := high word of the new rD1 */
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 2)
	evxor		rW3,rW3,rW5
	LWH(rW4, 4)
	evxor		rW3,rW3,rW7
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW1
	EAD(rD0, 1)
	/* low word of rD2 := high word of the new rD3 */
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* ---- second round of the pair, key at 32(rKP)/40(rKP) ---- */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,32(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,40(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 2)
	evxor		rW3,rW3,rW5
	LWH(rW4, 4)
	evxor		rW3,rW3,rW7
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW1
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* step to the next pair of round keys, loop while CTR != 0 */
	addi		rKP,rKP,32
	bdnz		ppc_encrypt_block_loop
	/* ---- last table based round, key at 16(rKP)/24(rKP) ---- */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAH(rW5, rD3, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,16(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* from here on the interleaved loads are byte loads for the final round */
	EAD(rD1, 0)
	evxor		rW3,rW3,rW5
	LBE(rW2)
	evxor		rW3,rW3,rW7
	EAD(rD0, 1)
	evxor		rD3,rD3,rW3
	LBE(rW6)
	evxor		rD3,rD3,rW1
	EAD(rD0, 0)
	evmergehi	rD2,rD2,rD3
	LBE(rW1)
	/* ---- final round: assemble rW0-rW3 byte by byte ---- */
	LAE(rW0, rD3, 0)
	LAE(rW1, rD0, 0)
	LAE(rW4, rD2, 1)
	LAE(rW5, rD3, 1)
	LAE(rW3, rD2, 0)
	LAE(rW7, rD1, 1)
	/* insert second byte (mask 0x0000ff00) */
	rlwimi		rW0,rW4,8,16,23
	rlwimi		rW1,rW5,8,16,23
	LAE(rW4, rD1, 2)
	LAE(rW5, rD2, 2)
	rlwimi		rW2,rW6,8,16,23
	rlwimi		rW3,rW7,8,16,23
	LAE(rW6, rD3, 2)
	LAE(rW7, rD0, 2)
	/* insert third byte (mask 0x00ff0000) */
	rlwimi		rW0,rW4,16,8,15
	rlwimi		rW1,rW5,16,8,15
	LAE(rW4, rD0, 3)
	LAE(rW5, rD1, 3)
	rlwimi		rW2,rW6,16,8,15
	/* rD0-rD3 are free now: fetch the key words the caller will xor in */
	lwz		rD0,32(rKP)
	rlwimi		rW3,rW7,16,8,15
	lwz		rD1,36(rKP)
	LAE(rW6, rD2, 3)
	LAE(rW7, rD3, 3)
	/* insert fourth byte (mask 0xff000000) */
	rlwimi		rW0,rW4,24,0,7
	lwz		rD2,40(rKP)
	rlwimi		rW1,rW5,24,0,7
	lwz		rD3,44(rKP)
	rlwimi		rW2,rW6,24,0,7
	rlwimi		rW3,rW7,24,0,7
	blr

/*
 * ppc_decrypt_block: The central decryption function for a single 16 bytes
 * block. It does no stack handling or register saving to support fast calls
 * via bl/blr. It expects that caller has pre-xored input data with first
 * 4 words of the key into rD0-rD3 (presumably the 16 bytes at 0(rKP), which
 * this routine itself never reads - confirm against the callers).
 * Pointer/counter registers must have also been set up before (rT0, rT1,
 * rKP, CTR): rT0 is used for all word lookups, rT1 for the byte lookups of
 * the final round. Output is stored in rD0-rD3 and rW0-rW3 and caller must
 * execute a final xor on the output registers.
 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
 *
 * Structure of the code below:
 *  - The loop body holds two unrolled rounds. The first reads its key from
 *    16(rKP)/24(rKP), the second from 32(rKP)/40(rKP); afterwards rKP is
 *    advanced by 32 and CTR is decremented (bdnz). CTR is therefore the
 *    number of round pairs executed by the loop.
 *  - After the loop one more table based round follows (key at 16/24(rKP)).
 *  - The last round gathers single bytes through rT1 (LAD/LBD) and merges
 *    four of them into each of rW0-rW3 with rlwimi, while rD0-rD3 are
 *    loaded with the four key words at 32..44(rKP).
 *  - The lookups for rW0, rW6 and rW3 (high halves) are always issued at
 *    the tail of the preceding round, or before the loop label for the
 *    very first round.
 *
 * rKP is modified (advanced by 32 bytes per loop iteration); the variable
 * bits of rT0 and rT1 are overwritten by every address calculation.
 *
 */
_GLOBAL(ppc_decrypt_block)
	/* lookups for rW0/rW6/rW3 (high halves) of the first round */
	LAH(rW0, rD1, 0, 12)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_decrypt_block_loop:
	/* ---- first round of the pair, key at 16(rKP)/24(rKP) ---- */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	/* xor chain interleaved with the remaining/next lookups */
	evldw		rD1,16(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	/* low word of rD0 := high word of the new rD1 */
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LWH(rW0, 12)
	evxor		rW3,rW3,rW1
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW5
	EAD(rD0, 1)
	/* low word of rD2 := high word of the new rD3 */
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* ---- second round of the pair, key at 32(rKP)/40(rKP) ---- */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,32(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,40(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LWH(rW0, 12)
	evxor		rW3,rW3,rW1
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW5
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* step to the next pair of round keys, loop while CTR != 0 */
	addi		rKP,rKP,32
	bdnz		ppc_decrypt_block_loop
	/* ---- last table based round, key at 16(rKP)/24(rKP) ---- */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,16(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* from here on the interleaved loads are byte loads via rT1 */
	DAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LBD(rW0)
	evxor		rW3,rW3,rW1
	DAD(rD0, 1)
	evxor		rD3,rD3,rW3
	LBD(rW6)
	evxor		rD3,rD3,rW5
	DAD(rD0, 0)
	evmergehi	rD2,rD2,rD3
	LBD(rW3)
	/* ---- final round: assemble rW0-rW3 byte by byte ---- */
	LAD(rW2, rD3, 0)
	LAD(rW1, rD2, 0)
	LAD(rW4, rD2, 1)
	LAD(rW5, rD3, 1)
	LAD(rW7, rD1, 1)
	/* insert second byte (mask 0x0000ff00) */
	rlwimi		rW0,rW4,8,16,23
	rlwimi		rW1,rW5,8,16,23
	LAD(rW4, rD3, 2)
	LAD(rW5, rD0, 2)
	rlwimi		rW2,rW6,8,16,23
	rlwimi		rW3,rW7,8,16,23
	LAD(rW6, rD1, 2)
	LAD(rW7, rD2, 2)
	/* insert third byte (mask 0x00ff0000) */
	rlwimi		rW0,rW4,16,8,15
	rlwimi		rW1,rW5,16,8,15
	LAD(rW4, rD0, 3)
	LAD(rW5, rD1, 3)
	rlwimi		rW2,rW6,16,8,15
	/* rD0-rD3 are free now: fetch the key words the caller will xor in */
	lwz		rD0,32(rKP)
	rlwimi		rW3,rW7,16,8,15
	lwz		rD1,36(rKP)
	LAD(rW6, rD2, 3)
	LAD(rW7, rD3, 3)
	/* insert fourth byte (mask 0xff000000) */
	rlwimi		rW0,rW4,24,0,7
	lwz		rD2,40(rKP)
	rlwimi		rW1,rW5,24,0,7
	lwz		rD3,44(rKP)
	rlwimi		rW2,rW6,24,0,7
	rlwimi		rW3,rW7,24,0,7
	blr
