1/*
2 * Fast SHA-1 implementation for SPE instruction set (PPC)
3 *
4 * This code makes use of the SPE SIMD instruction set as defined in
5 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
6 * Implementation is based on optimization guide notes from
7 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
8 *
9 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
10 *
11 * This program is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the Free
13 * Software Foundation; either version 2 of the License, or (at your option)
14 * any later version.
15 *
16 */
17
18#include <asm/ppc_asm.h>
19#include <asm/asm-offsets.h>
20
/*
 * Register aliases used by all macros below. r3..r5 are read as the three
 * incoming arguments; r5 is copied to CTR in ppc_spe_sha1_transform before
 * it is reused as pointer to the constant table. r14..r23 are saved by
 * INITIALIZE and restored by FINALIZE.
 */
21#define rHP	r3	/* pointer to hash value			*/
22#define rWP	r4	/* pointer to input				*/
23#define rKP	r5	/* pointer to constants				*/
24
25#define rW0	r14	/* 64 bit round words				*/
26#define rW1	r15
27#define rW2	r16
28#define rW3	r17
29#define rW4	r18
30#define rW5	r19
31#define rW6	r20
32#define rW7	r21
33
34#define rH0	r6	/* 32 bit hash values 				*/
35#define rH1	r7
36#define rH2	r8
37#define rH3	r9
38#define rH4	r10
39
40#define rT0	r22	/* 64 bit temporary				*/
41#define rT1	r0	/* 32 bit temporaries				*/
42#define rT2	r11
43#define rT3	r12
44
45#define rK	r23	/* 64 bit constant in non volatile register	*/
46
/*
 * Constant loaders. The round macros paste their k argument into the name
 * LOAD_K<k>1. k = 0 expands to nothing, so the constant already held in rK
 * stays in use. k = 1..4 load the word at offset 0/4/8/12 from rKP and
 * replicate it into both 32 bit halves of rK (evlwwsplat).
 */
47#define LOAD_K01
48
49#define LOAD_K11 \
50	evlwwsplat	rK,0(rKP);
51
52#define LOAD_K21 \
53	evlwwsplat	rK,4(rKP);
54
55#define LOAD_K31 \
56	evlwwsplat	rK,8(rKP);
57
58#define LOAD_K41 \
59	evlwwsplat	rK,12(rKP);
60
/*
 * Function prologue: open a 128 byte stack frame and store the complete
 * 64 bit contents of r14..r23 (the registers aliased as rW0..rW7, rT0 and
 * rK) into 8 byte slots at offsets 8..80 of that frame. FINALIZE is the
 * matching epilogue.
 */
61#define INITIALIZE \
62	stwu		r1,-128(r1);	/* create stack frame		*/ \
63	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
64	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
65	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
66	evstdw		r17,32(r1);					   \
67	evstdw		r18,40(r1);					   \
68	evstdw		r19,48(r1);					   \
69	evstdw		r20,56(r1);					   \
70	evstdw		r21,64(r1);					   \
71	evstdw		r22,72(r1);					   \
72	evstdw		r23,80(r1);
73
74
/*
 * Function epilogue: reload the 64 bit r14..r23 from the slots written by
 * INITIALIZE, then store a zero word to the start of each of those 8 byte
 * slots and release the 128 byte frame. r0 is zeroed and thereby clobbered.
 *
 * NOTE(review): only the first word of every slot is wiped (offsets 8, 16,
 * ..., 80); the second word (offsets 12, 20, ...) is left as stored. On a
 * big endian target the first word should be the upper (SPE) half written
 * by evstdw, which matches the comment below - confirm for other layouts.
 */
75#define FINALIZE \
76	evldw		r14,8(r1);	/* restore SPE registers	*/ \
77	evldw		r15,16(r1);					   \
78	evldw		r16,24(r1);					   \
79	evldw		r17,32(r1);					   \
80	evldw		r18,40(r1);					   \
81	evldw		r19,48(r1);					   \
82	evldw		r20,56(r1);					   \
83	evldw		r21,64(r1);					   \
84	evldw		r22,72(r1);					   \
85	evldw		r23,80(r1);					   \
86	xor		r0,r0,r0;					   \
87	stw		r0,8(r1);	/* Delete sensitive data	*/ \
88	stw		r0,16(r1);	/* that we might have pushed	*/ \
89	stw		r0,24(r1);	/* from other context that runs	*/ \
90	stw		r0,32(r1);	/* the same code. Assume that	*/ \
91	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
92	stw		r0,48(r1);	/* were already overwritten on	*/ \
93	stw		r0,56(r1);	/* the way down to here		*/ \
94	stw		r0,64(r1);					   \
95	stw		r0,72(r1);					   \
96	stw		r0,80(r1);					   \
97	addi		r1,r1,128;	/* cleanup stack frame		*/
98
/*
 * Input word access, selected by endianness.
 *
 * __BIG_ENDIAN__: LOAD_DATA is a plain lwz at displacement off from rWP
 * and rWP is advanced once per block by NEXT_BLOCK (64 bytes).
 *
 * otherwise: LOAD_DATA is a byte reversing lwbrx from rWP followed by a
 * 4 byte increment of rWP. The off argument is not used in this variant
 * and NEXT_BLOCK expands to nothing because rWP has already moved past
 * the block word by word.
 */
99#ifdef __BIG_ENDIAN__
100#define LOAD_DATA(reg, off) \
101	lwz		reg,off(rWP);	/* load data			*/
102#define NEXT_BLOCK \
103	addi		rWP,rWP,64;	/* increment per block		*/
104#else
105#define LOAD_DATA(reg, off) \
106	lwbrx		reg,0,rWP;	/* load data			*/ \
107	addi		rWP,rWP,4;	/* increment per word		*/
108#define NEXT_BLOCK			/* nothing to do		*/
109#endif
110
/*
 * R_00_15 - two consecutive SHA-1 rounds that consume input words directly.
 *
 * a..e	the five state registers. Round 1 ("1:" comments) treats them as
 *	A,B,C,D,E and accumulates into e. Round 2 ("2:" comments) treats
 *	e,a,b,c,d as A,B,C,D,E and accumulates into d. b and a are rotated
 *	left by 30 in place.
 * w0	receives the first input word (scratch afterwards)
 * w1	receives the second input word; the final evmergelo leaves the
 *	second word in its upper half and the first word in its lower half
 * k	selects LOAD_K<k>1 (0 = keep current rK)
 * off	byte offset of the first word, passed on to LOAD_DATA
 *
 * Both rounds use F = (B and C) or (~B and D). Clobbers rT0, rT1, rT2.
 */
111#define	R_00_15(a, b, c, d, e, w0, w1, k, off) \
112	LOAD_DATA(w0, off)		/* 1: W				*/ \
113	and		rT2,b,c;	/* 1: F' = B and C 		*/ \
114	LOAD_K##k##1							   \
115	andc		rT1,d,b;	/* 1: F" = ~B and D 		*/ \
116	rotrwi		rT0,a,27;	/* 1: A' = A rotl 5		*/ \
117	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
118	add		e,e,rT0;	/* 1: E = E + A'		*/ \
119	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
120	add		e,e,w0;		/* 1: E = E + W			*/ \
121	LOAD_DATA(w1, off+4)		/* 2: W				*/ \
122	add		e,e,rT2;	/* 1: E = E + F			*/ \
123	and		rT1,a,b;	/* 2: F' = B and C 		*/ \
124	add		e,e,rK;		/* 1: E = E + K			*/ \
125	andc		rT2,c,a;	/* 2: F" = ~B and D 		*/ \
126	add		d,d,rK;		/* 2: E = E + K			*/ \
127	or		rT2,rT2,rT1;	/* 2: F = F' or F"		*/ \
128	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
129	add		d,d,w1;		/* 2: E = E + W			*/ \
130	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
131	add		d,d,rT0;	/* 2: E = E + A'		*/ \
132	evmergelo	w1,w1,w0;	/*    mix W[0]/W[1]		*/ \
133	add		d,d,rT2		/* 2: E = E + F			*/
134
/*
 * R_16_19 - two SHA-1 rounds with F = (B and C) or (~B and D) whose W
 * values are derived from earlier words instead of being loaded.
 *
 * a..e	state registers, same role rotation between round 1 and round 2
 *	as in R_00_15 (round 1 accumulates into e, round 2 into d)
 * w0	in: word pair W[-16]; out: the newly computed word pair
 * w1	word pair W[-14]
 * w4	word pair W[-8]
 * w6,w7	neighbouring pairs that evmergelohi combines into W[-3]
 * k	selects LOAD_K<k>1; the load is placed after evaddw has consumed
 *	rK, so a new constant is first used by the next macro invocation
 *
 * Both new words are computed at once with the 64 bit SPE operations
 * (xor, rotate left by 1, add K). The lower half of the sum in rT0 feeds
 * round 1; evmergehi copies the upper half into the lower half of rT1 for
 * round 2. Clobbers rT0, rT1, rT2.
 */
135#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
136	and		rT2,b,c;	/* 1: F' = B and C 		*/ \
137	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
138	andc		rT1,d,b;	/* 1: F" = ~B and D 		*/ \
139	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
140	or		rT1,rT1,rT2;	/* 1: F = F' or F"		*/ \
141	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
142	add		e,e,rT1;	/* 1: E = E + F			*/ \
143	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
144	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
145	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
146	add		e,e,rT2;	/* 1: E = E + A'		*/ \
147	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
148	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
149	LOAD_K##k##1							   \
150	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
151	add		e,e,rT0;	/* 1: E = E + WK		*/ \
152	add		d,d,rT1;	/* 2: E = E + WK		*/ \
153	and		rT2,a,b;	/* 2: F' = B and C 		*/ \
154	andc		rT1,c,a;	/* 2: F" = ~B and D 		*/ \
155	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
156	or		rT1,rT1,rT2;	/* 2: F = F' or F"		*/ \
157	add		d,d,rT0;	/* 2: E = E + A'		*/ \
158	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
159	add		d,d,rT1		/* 2: E = E + F			*/
160
/*
 * R_20_39 - two SHA-1 rounds with the parity function F = B xor C xor D.
 *
 * a..e	state registers; round 1 accumulates into e, round 2 treats
 *	e,a,b,c,d as A,B,C,D,E and accumulates into d
 * w0	in: word pair W[-16]; out: the newly computed word pair
 * w1	word pair W[-14]
 * w4	word pair W[-8]
 * w6,w7	neighbouring pairs that evmergelohi combines into W[-3]
 * k	selects LOAD_K<k>1; placed after evaddw, so a newly loaded constant
 *	takes effect with the next macro invocation
 *
 * The message schedule and the W + K handling are the same as in R_16_19:
 * lower half of rT0 for round 1, upper half moved into rT1 for round 2.
 * Clobbers rT0, rT1, rT2.
 */
161#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
162	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
163	xor		rT2,b,c;	/* 1: F' = B xor C		*/ \
164	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
165	xor		rT2,rT2,d;	/* 1: F = F' xor D		*/ \
166	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
167	add		e,e,rT2;	/* 1: E = E + F			*/ \
168	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
169	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
170	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
171	add		e,e,rT2;	/* 1: E = E + A'		*/ \
172	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
173	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
174	LOAD_K##k##1							   \
175	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
176	add		e,e,rT0;	/* 1: E = E + WK		*/ \
177	xor		rT2,a,b;	/* 2: F' = B xor C		*/ \
178	add		d,d,rT1;	/* 2: E = E + WK		*/ \
179	xor		rT2,rT2,c;	/* 2: F = F' xor D		*/ \
180	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
181	add		d,d,rT2;	/* 2: E = E + F			*/ \
182	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
183	add		d,d,rT0		/* 2: E = E + A'		*/
184
/*
 * R_40_59 - two SHA-1 rounds with the majority function, computed here as
 * F = (B and C) or (D and (B or C)).
 *
 * a..e	state registers; round 1 accumulates into e, round 2 treats
 *	e,a,b,c,d as A,B,C,D,E and accumulates into d
 * w0	in: word pair W[-16]; out: the newly computed word pair
 * w1	word pair W[-14]
 * w4	word pair W[-8]
 * w6,w7	neighbouring pairs that evmergelohi combines into W[-3]
 * k	selects LOAD_K<k>1; placed after evaddw, so a newly loaded constant
 *	takes effect with the next macro invocation
 *
 * rT0 is reused as a plain 32 bit scratch register for round 2 once its
 * lower half (WK of round 1) has been added to e and its upper half has
 * been copied to rT1. Clobbers rT0, rT1, rT2.
 */
185#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
186	and		rT2,b,c;	/* 1: F' = B and C		*/ \
187	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
188	or		rT1,b,c;	/* 1: F" = B or C		*/ \
189	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
190	and		rT1,d,rT1;	/* 1: F" = F" and D		*/ \
191	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
192	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
193	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
194	add		e,e,rT2;	/* 1: E = E + F			*/ \
195	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
196	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
197	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
198	add		e,e,rT2;	/* 1: E = E + A'		*/ \
199	LOAD_K##k##1							   \
200	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
201	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
202	add		e,e,rT0;	/* 1: E = E + WK		*/ \
203	and		rT2,a,b;	/* 2: F' = B and C		*/ \
204	or		rT0,a,b;	/* 2: F" = B or C		*/ \
205	add		d,d,rT1;	/* 2: E = E + WK		*/ \
206	and		rT0,c,rT0;	/* 2: F" = F" and D		*/ \
207	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
208	or		rT2,rT2,rT0;	/* 2: F = F' or F"		*/ \
209	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
210	add		d,d,rT2;	/* 2: E = E + F			*/ \
211	add		d,d,rT0		/* 2: E = E + A'		*/
212
/*
 * R_60_79 - same instruction sequence as R_20_39 (F = B xor C xor D). The
 * rounds differ only in the constant held in rK, which is whatever the
 * last non-zero k argument loaded.
 */
213#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
214	R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
215
/*
 * ppc_spe_sha1_transform - SHA-1 compression over a run of 64 byte blocks
 *
 * In:	r3 (rHP) = pointer to the five 32 bit hash words; read at entry
 *		   and rewritten after every block
 *	r4 (rWP) = pointer to the input data, advanced by 64 per block
 *	r5       = number of blocks; moved to CTR, r5 is then reused as
 *		   rKP (address of PPC_SPE_SHA1_K)
 * Out:	hash words at 0..16(r3) updated in place, no return value
 * Clobbers: r0, r4..r12, CTR, cr0. r14..r23 are preserved through
 *	INITIALIZE/FINALIZE.
 *
 * presumably declared in C as
 *	void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks)
 * - TODO confirm against the glue code.
 *
 * A block count of zero returns immediately. Without that check CTR would
 * start at 0 and bdnz, which decrements before testing, would keep looping
 * until CTR wrapped all the way around.
 */
216_GLOBAL(ppc_spe_sha1_transform)
	cmpwi		r5,0
	beqlr
217	INITIALIZE
218
219	lwz		rH0,0(rHP)
220	lwz		rH1,4(rHP)
221	mtctr		r5
222	lwz		rH2,8(rHP)
223	lis		rKP,PPC_SPE_SHA1_K@h
224	lwz		rH3,12(rHP)
225	ori		rKP,rKP,PPC_SPE_SHA1_K@l
226	lwz		rH4,16(rHP)
227
/* one iteration per 64 byte block, CTR holds the remaining block count */
228ppc_spe_sha1_main:
	/* rounds 0..15: load input, first call also loads K1 */
229	R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
230	R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
231	R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
232	R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
233	R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
234	R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
235	R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
236	R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)
237
	/* rounds 16..19; the last call switches rK to K2 */
238	R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
239	R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)
240
	/* rounds 20..39; the last call switches rK to K3 */
241	R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
242	R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
243	R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
244	R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
245	R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
246	R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
247	R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
248	R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
249	R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
250	R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)
251
	/* rounds 40..59; the last call switches rK to K4 */
252	R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
253	R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
254	R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
255	R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
256	R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
257	R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
258	R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
259	R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
260	R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
261	R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)
262
	/*
	 * rounds 60..79. The previous hash words are reloaded in between
	 * into rT3 and rW1..rW4, each one only after the last round macro
	 * that still reads that register.
	 */
263	R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
264	R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
265	R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
266	R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
267	R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
268	R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
269	R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
270	lwz		rT3,0(rHP)
271	R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
272	lwz		rW1,4(rHP)
273	R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
274	lwz		rW2,8(rHP)
275	R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
276	lwz		rW3,12(rHP)
277	NEXT_BLOCK
278	lwz		rW4,16(rHP)
279
	/* add the block result to the previous hash and write it back */
280	add		rH0,rH0,rT3
281	stw		rH0,0(rHP)
282	add		rH1,rH1,rW1
283	stw		rH1,4(rHP)
284	add		rH2,rH2,rW2
285	stw		rH2,8(rHP)
286	add		rH3,rH3,rW3
287	stw		rH3,12(rHP)
288	add		rH4,rH4,rW4
289	stw		rH4,16(rHP)
290
291	bdnz		ppc_spe_sha1_main
292
293	FINALIZE
294	blr
295
/*
 * The four SHA-1 round constants, addressed through rKP at offsets
 * 0/4/8/12 by the LOAD_K macros. They are only ever read, so they are
 * placed in the read-only data section instead of writable .data.
 */
296.section .rodata
297.align 4
298PPC_SPE_SHA1_K:
299	.long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
300