/* csum_copy.S: Checksum+copy code for sparc64
 *
 * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
 */

#include <asm/export.h>

#ifdef __KERNEL__
#define GLOBAL_SPARE	%g7
#else
#define GLOBAL_SPARE	%g5
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	csum_partial_copy_nocheck
#endif

	.register	%g2, #scratch
	.register	%g3, #scratch

	.text

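	/* 90: reached from the alignment check in FUNC_NAME when src
	 * and dst agree in their low two address bits but src is not
	 * word aligned.  The branch below tests the andcc of address
	 * bit 0 done in the delay slot of the branch that got us here.
	 */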
90:
	/* We checked for zero length already, so there must be
	 * at least one byte.
	 */
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(ldub, %o0 + 0x00, %o4))
	add		%o0, 1, %o0
	sub		%o2, 1, %o2
	EX_ST(STORE(stb, %o4, %o1 + 0x00))
	add		%o1, 1, %o1
1:	andcc		%o0, 0x2, %g0
	be,pn		%icc, 80f
	 cmp		%o2, 2
	blu,pn		%icc, 60f
	 nop
	EX_LD(LOAD(lduh, %o0 + 0x00, %o5))
	add		%o0, 2, %o0
	sub		%o2, 2, %o2
	EX_ST(STORE(sth, %o5, %o1 + 0x00))
	add		%o1, 2, %o1
	ba,pt		%xcc, 80f
	 add		%o5, %o4, %o4

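	/* Note: EX_LD/EX_ST default to plain loads/stores here; the
	 * user-copy variants that include this file redefine them to
	 * add exception-table entries.  The 32-bit checksum result is
	 * returned zero-extended in %o0.
	 */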
	.globl		FUNC_NAME
	.type		FUNC_NAME,#function
	EXPORT_SYMBOL(FUNC_NAME)
FUNC_NAME:		/* %o0=src, %o1=dst, %o2=len, %o3=sum */
	LOAD(prefetch, %o0 + 0x000, #n_reads)
	xor		%o0, %o1, %g1
	clr		%o4
	andcc		%g1, 0x3, %g0
	bne,pn		%icc, 95f
	 LOAD(prefetch, %o0 + 0x040, #n_reads)

	brz,pn		%o2, 70f
	 andcc		%o0, 0x3, %g0

	/* We remember in GLOBAL_SPARE whether the lowest bit of the
	 * address was set, because if it was we have to swap the
	 * upper and lower 8-bit halves of the sum we calculate.
	 */
	bne,pn		%icc, 90b
	 andcc		%o0, 0x1, GLOBAL_SPARE

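	/* 80: src is word aligned and dst shares its alignment.
	 * Split the length into 64-byte chunks (%g3) and a
	 * remainder (%o2).
	 */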
80:
	LOAD(prefetch, %o0 + 0x080, #n_reads)
	andncc		%o2, 0x3f, %g3

	LOAD(prefetch, %o0 + 0x0c0, #n_reads)
	sub		%o2, %g3, %o2
	brz,pn		%g3, 2f
	 LOAD(prefetch, %o0 + 0x100, #n_reads)

	/* So that we don't need to use the non-pairing
	 * add-with-carry instructions we accumulate 32-bit
	 * values into a 64-bit register.  At the end of the
	 * loop we fold it down to 32 bits, then to 16.
	 */
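	/* Sixteen 32-bit additions per iteration cannot overflow a
	 * 64-bit accumulator until roughly 2^32 words have been
	 * summed, far beyond any possible length.
	 */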
	ba,pt		%xcc, 1f
	LOAD(prefetch, %o0 + 0x140, #n_reads)

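	/* Main loop: copy and sum 64 bytes (16 words) per iteration.
	 * Loads run ahead of the matching stores so each add is
	 * interleaved with independent memory operations.
	 */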
	.align		32
1:	EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
	EX_LD(LOAD(lduw, %o0 + 0x04, %g1))
	EX_LD(LOAD(lduw, %o0 + 0x08, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x00))
	EX_LD(LOAD(lduw, %o0 + 0x0c, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x04))
	EX_LD(LOAD(lduw, %o0 + 0x10, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x08))
	EX_LD(LOAD(lduw, %o0 + 0x14, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x0c))
	EX_LD(LOAD(lduw, %o0 + 0x18, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x10))
	EX_LD(LOAD(lduw, %o0 + 0x1c, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x14))
	EX_LD(LOAD(lduw, %o0 + 0x20, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x18))
	EX_LD(LOAD(lduw, %o0 + 0x24, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x1c))
	EX_LD(LOAD(lduw, %o0 + 0x28, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x20))
	EX_LD(LOAD(lduw, %o0 + 0x2c, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x24))
	EX_LD(LOAD(lduw, %o0 + 0x30, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x28))
	EX_LD(LOAD(lduw, %o0 + 0x34, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x2c))
	EX_LD(LOAD(lduw, %o0 + 0x38, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x30))
	EX_LD(LOAD(lduw, %o0 + 0x3c, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x34))
	LOAD(prefetch, %o0 + 0x180, #n_reads)
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x38))
	subcc		%g3, 0x40, %g3
	add		%o0, 0x40, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x3c))
	bne,pt		%icc, 1b
	 add		%o1, 0x40, %o1

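	/* Copy and sum any remaining whole words (%o2 & 0x3c bytes). */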
2:	and		%o2, 0x3c, %g3
	brz,pn		%g3, 2f
	 sub		%o2, %g3, %o2
1:	EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
	subcc		%g3, 0x4, %g3
	add		%o0, 0x4, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x00))
	bne,pt		%icc, 1b
	 add		%o1, 0x4, %o1

2:
	/* fold 64-->32 */
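	/* Add the high and low 32-bit halves; the add can carry into
	 * bit 32, so fold a second time to bring that bit back down.
	 */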
	srlx		%o4, 32, %o5
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4
	srlx		%o4, 32, %o5
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4

	/* fold 32-->16 */
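	/* sum = (sum >> 16) + (sum & 0xffff), applied twice since
	 * the first add may itself carry into bit 16.
	 */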
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4

60:
	/* %o4 has the 16-bit sum we have calculated so far.  */
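	/* At most three bytes remain: an aligned halfword and/or a
	 * final odd byte.
	 */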
	cmp		%o2, 2
	blu,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduh, %o0 + 0x00, %o5))
	sub		%o2, 2, %o2
	add		%o0, 2, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(sth, %o5, %o1 + 0x00))
	add		%o1, 0x2, %o1
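	/* A final odd byte occupies the high 8 bits of its 16-bit
	 * big-endian word, hence the shift left by 8 below.
	 */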
1:	brz,pt		%o2, 1f
	 nop
	EX_LD(LOAD(ldub, %o0 + 0x00, %o5))
	sub		%o2, 1, %o2
	add		%o0, 1, %o0
	EX_ST(STORE(stb, %o5, %o1 + 0x00))
	sllx		%o5, 8, %o5
	add		%o1, 1, %o1
	add		%o4, %o5, %o4
1:
	/* fold 32-->16 */
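	/* The trailing halfword/byte may have carried past bit 15,
	 * so fold once more.
	 */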
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4

1:	brz,pt		GLOBAL_SPARE, 1f
	 nop

	/* We started with an odd byte; byte-swap the result.  */
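	/* A ones'-complement sum taken from an odd starting offset
	 * equals the byte-swap of the sum taken from an even offset
	 * (see RFC 1071), so a single swap corrects the result.
	 */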
	srl		%o4, 8, %o5
	and		%o4, 0xff, %g1
	sll		%g1, 8, %g1
	or		%o5, %g1, %o4

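	/* Merge with the caller's seed checksum (%o3), folding the
	 * end-around carry back in.
	 */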
1:	addcc		%o3, %o4, %o3
	addc		%g0, %o3, %o3

70:
	retl
	 srl		%o3, 0, %o0

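	/* 95: src and dst disagree in their low two address bits, so
	 * word-sized copies are impossible.  Copy with byte stores,
	 * accumulating the sum in GLOBAL_SPARE; %o5 remembers whether
	 * src began on an odd byte.
	 */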
95:	mov		0, GLOBAL_SPARE
	brlez,pn	%o2, 4f
	 andcc		%o0, 1, %o5
	be,a,pt		%icc, 1f
	 srl		%o2, 1, %g1
	sub		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o0, GLOBAL_SPARE))
	add		%o0, 1, %o0
	EX_ST(STORE(stb, GLOBAL_SPARE, %o1))
	srl		%o2, 1, %g1
	add		%o1, 1, %o1
1:	brz,a,pn	%g1, 3f
	 andcc		%o2, 1, %g0
	andcc		%o0, 2, %g0
	be,a,pt		%icc, 1f
	 srl		%g1, 1, %g1
	EX_LD(LOAD(lduh, %o0, %o4))
	sub		%o2, 2, %o2
	srl		%o4, 8, %g2
	sub		%g1, 1, %g1
	EX_ST(STORE(stb, %g2, %o1))
	add		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 1))
	add		%o0, 2, %o0
	srl		%g1, 1, %g1
	add		%o1, 2, %o1
1:	brz,a,pn	%g1, 2f
	 andcc		%o2, 2, %g0
	EX_LD(LOAD(lduw, %o0, %o4))
5:	srl		%o4, 24, %g2
	srl		%o4, 16, %g3
	EX_ST(STORE(stb, %g2, %o1))
	srl		%o4, 8, %g2
	EX_ST(STORE(stb, %g3, %o1 + 1))
	add		%o0, 4, %o0
	EX_ST(STORE(stb, %g2, %o1 + 2))
	addcc		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 3))
	addc		GLOBAL_SPARE, %g0, GLOBAL_SPARE
	add		%o1, 4, %o1
	subcc		%g1, 1, %g1
	bne,a,pt	%icc, 5b
	 EX_LD(LOAD(lduw, %o0, %o4))
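	/* Word loop done: fold the 32-bit accumulator down to 16 bits
	 * (high half plus low half) before handling the tail.
	 */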
	sll		GLOBAL_SPARE, 16, %g2
	srl		GLOBAL_SPARE, 16, GLOBAL_SPARE
	srl		%g2, 16, %g2
	andcc		%o2, 2, %g0
	add		%g2, GLOBAL_SPARE, GLOBAL_SPARE
2:	be,a,pt		%icc, 3f
	 andcc		%o2, 1, %g0
	EX_LD(LOAD(lduh, %o0, %o4))
	andcc		%o2, 1, %g0
	srl		%o4, 8, %g2
	add		%o0, 2, %o0
	EX_ST(STORE(stb, %g2, %o1))
	add		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 1))
	add		%o1, 2, %o1
3:	be,a,pt		%icc, 1f
	 sll		GLOBAL_SPARE, 16, %o4
	EX_LD(LOAD(ldub, %o0, %g2))
	sll		%g2, 8, %o4
	EX_ST(STORE(stb, %g2, %o1))
	add		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 16, %o4
1:	addcc		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	srl		GLOBAL_SPARE, 16, %o4
	addc		%g0, %o4, GLOBAL_SPARE
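	/* As above: if src began on an odd byte, swap the two bytes
	 * of the 16-bit result.
	 */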
	brz,pt		%o5, 4f
	 srl		GLOBAL_SPARE, 8, %o4
	and		GLOBAL_SPARE, 0xff, %g2
	and		%o4, 0xff, %o4
	sll		%g2, 8, %g2
	or		%g2, %o4, GLOBAL_SPARE
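	/* 4: merge with the caller's seed sum and return the 32-bit
	 * result zero-extended in %o0.
	 */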
4:	addcc		%o3, GLOBAL_SPARE, %o3
	addc		%g0, %o3, %o0
	retl
	 srl		%o0, 0, %o0
	.size		FUNC_NAME, .-FUNC_NAME