/*
 * Fast SHA-256 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 */

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

#define rHP	r3	/* pointer to hash values in memory		*/
#define rKP	r24	/* pointer to round constants			*/
#define rWP	r4	/* pointer to input data			*/

#define rH0	r5	/* 8 32 bit hash values in 8 registers		*/
#define rH1	r6
#define rH2	r7
#define rH3	r8
#define rH4	r9
#define rH5	r10
#define rH6	r11
#define rH7	r12

#define rW0	r14	/* 64 bit registers. 16 words in 8 registers	*/
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rT0	r22	/* 64 bit temporaries				*/
#define rT1	r23
#define rT2	r0	/* 32 bit temporaries				*/
#define rT3	r25

#define CMP_KN_LOOP
#define CMP_KC_LOOP \
	cmpwi		rT1,0;
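
/*
 * Loop control for the 16 round block below: every pass advances rKP
 * by 64 bytes, and only the final R_CALC_W of a pass uses CMP_KC_LOOP
 * to signed-compare the just loaded constant pair against zero. The
 * words ending the first two constant groups (0x14292967, 0x106aa070)
 * are positive, so "bt gt" loops; the word ending the table
 * (0xc67178f2) is negative as a signed 32 bit value, so the loop
 * falls through after the third pass without a separate round counter.
 */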

#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non-volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1);					   \
	evstdw		r18,40(r1);					   \
	evstdw		r19,48(r1);					   \
	evstdw		r20,56(r1);					   \
	evstdw		r21,64(r1);					   \
	evstdw		r22,72(r1);					   \
	evstdw		r23,80(r1);					   \
	stw		r24,88(r1);	/* save normal registers	*/ \
	stw		r25,92(r1);


#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1);					   \
	evldw		r16,24(r1);					   \
	evldw		r17,32(r1);					   \
	evldw		r18,40(r1);					   \
	evldw		r19,48(r1);					   \
	evldw		r20,56(r1);					   \
	evldw		r21,64(r1);					   \
	evldw		r22,72(r1);					   \
	evldw		r23,80(r1);					   \
	lwz		r24,88(r1);	/* restore normal registers	*/ \
	lwz		r25,92(r1);					   \
	xor		r0,r0,r0;					   \
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from another context that	*/ \
	stw		r0,32(r1);	/* ran the same code. Assume	*/ \
	stw		r0,40(r1);	/* that the lower part of the	*/ \
	stw		r0,48(r1);	/* GPRs was already overwritten	*/ \
	stw		r0,56(r1);	/* on the way down to here	*/ \
	stw		r0,64(r1);					   \
	stw		r0,72(r1);					   \
	stw		r0,80(r1);					   \
	addi		r1,r1,128;	/* cleanup stack frame		*/

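/*
 * SHA-256 interprets the input as big endian 32 bit words. On big
 * endian builds the words are loaded directly and rWP is advanced
 * once per 64 byte block. On little endian builds each word is byte
 * reversed with lwbrx; since lwbrx only exists in indexed form
 * without an immediate offset, rWP is advanced by 4 after every load
 * instead.
 */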
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif

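/*
 * R_LOAD_W performs two of the first 16 SHA-256 rounds. The message
 * words come straight from the input block; after the two rounds the
 * 64 bit register w holds both consecutive words (evmergelo shifts
 * the first word into the upper half before the second is loaded),
 * ready for the schedule calculation later on. Instead of rotating
 * the eight working variables each round, the caller permutes the
 * register arguments from one macro invocation to the next.
 */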
#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
	LOAD_DATA(w, off)		/* 1: W				*/ \
	rotrwi		rT0,e,6;	/* 1: S1 = e rotr 6		*/ \
	rotrwi		rT1,e,11;	/* 1: S1' = e rotr 11		*/ \
	rotrwi		rT2,e,25;	/* 1: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 1: S1 = S1 xor S1'		*/ \
	and		rT3,e,f;	/* 1: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 1: S1 = S1 xor S1"		*/ \
	andc		rT1,g,e;	/* 1: ch' = ~e and g		*/ \
	lwz		rT2,off(rKP);	/* 1: K				*/ \
	xor		rT3,rT3,rT1;	/* 1: ch = ch xor ch'		*/ \
	add		h,h,rT0;	/* 1: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 1: temp1' = ch + w		*/ \
	rotrwi		rT0,a,2;	/* 1: S0 = a rotr 2		*/ \
	add		h,h,rT3;	/* 1: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,a,13;	/* 1: S0' = a rotr 13		*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + K		*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 1: S0 = S0 xor S0'		*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	xor		rT3,rT0,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelo	w,w,w;		/*    shift W			*/ \
	or		rT2,a,b;	/* 1: maj = a or b		*/ \
	and		rT1,a,b;	/* 1: maj' = a and b		*/ \
	and		rT2,rT2,c;	/* 1: maj = maj and c		*/ \
	LOAD_DATA(w, off+4)		/* 2: W				*/ \
	or		rT2,rT1,rT2;	/* 1: maj = maj or maj'		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		rT3,rT3,rT2;	/* 1: temp2 = S0 + maj		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	add		h,h,rT3;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	lwz		rT2,off+4(rKP);	/* 2: K				*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 2: temp1' = ch + w		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	add		g,g,rT2;	/* 2: temp1 = temp1 + K		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/

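/*
 * R_CALC_W performs two rounds from the range 16..63 and, interleaved
 * with them, extends the message schedule by two words at once using
 * the SPE 2x32 bit vector instructions:
 *
 *	W[t] = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])
 *
 * with s0(x) = (x rotr 7) xor (x rotr 18) xor (x >> 3) and
 * s1(x) = (x rotr 17) xor (x rotr 19) xor (x >> 10). The evrlwi
 * rotate-left amounts (25, then 21 on the already rotated value, 15,
 * 13) are the left-rotation equivalents of these right rotations.
 */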
#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
	rotrwi		rT2,e,6;	/* 1: S1 = e rotr 6		*/ \
	evmergelohi	rT0,w0,w1;	/*    w[-15]			*/ \
	rotrwi		rT3,e,11;	/* 1: S1' = e rotr 11		*/ \
	evsrwiu		rT1,rT0,3;	/*    s0 = w[-15] >> 3		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1'		*/ \
	evrlwi		rT0,rT0,25;	/*    s0' = w[-15] rotr 7	*/ \
	rotrwi		rT3,e,25;	/* 1: S1" = e rotr 25		*/ \
	evxor		rT1,rT1,rT0;	/*    s0 = s0 xor s0'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1"		*/ \
	evrlwi		rT0,rT0,21;	/*    s0' = w[-15] rotr 18	*/ \
	add		h,h,rT2;	/* 1: temp1 = h + S1		*/ \
	evxor		rT0,rT0,rT1;	/*    s0 = s0 xor s0'		*/ \
	and		rT2,e,f;	/* 1: ch = e and f		*/ \
	evaddw		w0,w0,rT0;	/*    w = w[-16] + s0		*/ \
	andc		rT3,g,e;	/* 1: ch' = ~e and g		*/ \
	evsrwiu		rT0,w7,10;	/*    s1 = w[-2] >> 10		*/ \
	xor		rT2,rT2,rT3;	/* 1: ch = ch xor ch'		*/ \
	evrlwi		rT1,w7,15;	/*    s1' = w[-2] rotr 17	*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + ch	*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	rotrwi		rT2,a,2;	/* 1: S0 = a rotr 2		*/ \
	evrlwi		rT1,w7,13;	/*    s1' = w[-2] rotr 19	*/ \
	rotrwi		rT3,a,13;	/* 1: S0' = a rotr 13		*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0'		*/ \
	evldw		rT1,off(rKP);	/*    k				*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + s1		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelohi	rT0,w4,w5;	/*    w[-7]			*/ \
	and		rT3,a,b;	/* 1: maj = a and b		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + w[-7]		*/ \
	CMP_K##k##_LOOP							   \
	add		rT2,rT2,rT3;	/* 1: temp2 = S0 + maj		*/ \
	evaddw		rT1,rT1,w0;	/*    wk = w + k		*/ \
	xor		rT3,a,b;	/* 1: maj = a xor b		*/ \
	evmergehi	rT0,rT1,rT1;	/*    wk1/wk2			*/ \
	and		rT3,rT3,c;	/* 1: maj = maj and c		*/ \
	add		h,h,rT0;	/* 1: temp1 = temp1 + wk	*/ \
	add		rT2,rT2,rT3;	/* 1: temp2 = temp2 + maj	*/ \
	add		g,g,rT1;	/* 2: temp1 = temp1 + wk	*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		h,h,rT2;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + ch	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/

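/*
 * Entry point. The hash state is passed in r3, the input data in r4
 * and the number of 64 byte blocks to process in r5; the C prototype
 * in the glue code is presumably along the lines of
 * void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks).
 */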
_GLOBAL(ppc_spe_sha256_transform)
	INITIALIZE

	mtctr		r5
	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	lwz		rH2,8(rHP)
	lwz		rH3,12(rHP)
	lwz		rH4,16(rHP)
	lwz		rH5,20(rHP)
	lwz		rH6,24(rHP)
	lwz		rH7,28(rHP)

ppc_spe_sha256_main:
	lis		rKP,PPC_SPE_SHA256_K@ha
	addi		rKP,rKP,PPC_SPE_SHA256_K@l

	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
ppc_spe_sha256_16_rounds:
	addi		rKP,rKP,64
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW0, rW1, rW4, rW5, rW7, N, 0)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW1, rW2, rW5, rW6, rW0, N, 8)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW2, rW3, rW6, rW7, rW1, N, 16)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW3, rW4, rW7, rW0, rW2, N, 24)
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW4, rW5, rW0, rW1, rW3, N, 32)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW5, rW6, rW1, rW2, rW4, N, 40)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW6, rW7, rW2, rW3, rW5, N, 48)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW7, rW0, rW3, rW4, rW6, C, 56)
	bt		gt,ppc_spe_sha256_16_rounds

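	/* Feed forward: add the hash values this block started with
	 * to the working variables and store the updated state.
	 */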
	lwz		rW0,0(rHP)
	NEXT_BLOCK
	lwz		rW1,4(rHP)
	lwz		rW2,8(rHP)
	lwz		rW3,12(rHP)
	lwz		rW4,16(rHP)
	lwz		rW5,20(rHP)
	lwz		rW6,24(rHP)
	lwz		rW7,28(rHP)

	add		rH0,rH0,rW0
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)
	add		rH5,rH5,rW5
	stw		rH5,20(rHP)
	add		rH6,rH6,rW6
	stw		rH6,24(rHP)
	add		rH7,rH7,rW7
	stw		rH7,28(rHP)

	bdnz		ppc_spe_sha256_main

	FINALIZE
	blr

.data
.align 5
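/* SHA-256 round constants K[0] to K[63] as defined in FIPS 180-4 */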
PPC_SPE_SHA256_K:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2