/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

	.text

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * __csum_partial(buff, len, sum)
 */
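/*
 * For reference, a minimal C sketch of what this routine computes
 * (an illustration only, not kernel code: the helper name and the
 * halfword-at-a-time loop are ours; the asm below folds to the same
 * 16-bit checksum, though its intermediate 32-bit value may differ):
 *
 *	static u32 csum_partial_ref(const u8 *buff, int len, u32 sum)
 *	{
 *		u64 s = sum;
 *
 *		for (; len > 1; len -= 2, buff += 2)
 *			s += *(const u16 *)buff;	// big-endian halfwords
 *		if (len)
 *			s += (u32)*buff << 8;		// trailing byte: upper half
 *		while (s >> 32)				// fold carries back in
 *			s = (s & 0xffffffff) + (s >> 32);
 *		return (u32)s;
 *	}
 */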
_GLOBAL(__csum_partial)
	subi	r3,r3,4
	srawi.	r6,r4,2		/* Divide len by 4 and also clear carry */
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r0,r3,2		/* Align buffer to longword boundary */
	beq+	1f
	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
	subi	r4,r4,2
	addi	r3,r3,2
	srwi.	r6,r4,2		/* # words to do */
	adde	r5,r5,r0
	beq	3f
1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
	beq	21f
	mtctr	r6
2:	lwzu	r0,4(r3)
	adde	r5,r5,r0
	bdnz	2b
21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
	beq	3f
	lwz	r0,4(r3)
	mtctr	r6
	lwz	r6,8(r3)
	adde	r5,r5,r0
	lwz	r7,12(r3)
	adde	r5,r5,r6
	lwzu	r8,16(r3)
	adde	r5,r5,r7
	bdz	23f
22:	lwz	r0,4(r3)
	adde	r5,r5,r8
	lwz	r6,8(r3)
	adde	r5,r5,r0
	lwz	r7,12(r3)
	adde	r5,r5,r6
	lwzu	r8,16(r3)
	adde	r5,r5,r7
	bdnz	22b
23:	adde	r5,r5,r8
3:	andi.	r0,r4,2
	beq+	4f
	lhz	r0,4(r3)
	addi	r3,r3,2
	adde	r5,r5,r0
4:	andi.	r0,r4,1
	beq+	5f
	lbz	r0,4(r3)
	slwi	r0,r0,8		/* Upper byte of word */
	adde	r5,r5,r0
5:	addze	r3,r5		/* add in final carry */
	blr
EXPORT_SYMBOL(__csum_partial)

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 */
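/*
 * C sketch of the contract (illustration only: the helper name is
 * ours, the halfword loop stands in for the word/cacheline code
 * below, fault recovery really happens through the exception-table
 * entries further down, and error-path details such as zeroing the
 * rest of dst on a source fault are omitted):
 *
 *	static u32 ccg_ref(const u8 *src, u8 *dst, int len, u32 sum,
 *			   int *src_err, int *dst_err)
 *	{
 *		u64 s = sum;
 *		int i;
 *
 *		for (i = 0; i + 1 < len; i += 2) {
 *			u16 v = *(const u16 *)(src + i); // may fault: *src_err = -EFAULT
 *			*(u16 *)(dst + i) = v;		 // may fault: *dst_err = -EFAULT
 *			s += v;
 *		}
 *		if (i < len) {
 *			dst[i] = src[i];
 *			s += (u32)src[i] << 8;
 *		}
 *		while (s >> 32)
 *			s = (s & 0xffffffff) + (s >> 32);
 *		return (u32)s;
 *	}
 */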
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
	adde	r12,r12,r7;	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
	adde	r12,r12,r8;	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
	adde	r12,r12,r9;	\
8 ## n ## 7:			\
	stwu	r10,16(r6);	\
	adde	r12,r12,r10

#define CSUM_COPY_16_BYTES_EXCODE(n)		\
	EX_TABLE(8 ## n ## 0b, src_error);	\
	EX_TABLE(8 ## n ## 1b, src_error);	\
	EX_TABLE(8 ## n ## 2b, src_error);	\
	EX_TABLE(8 ## n ## 3b, src_error);	\
	EX_TABLE(8 ## n ## 4b, dst_error);	\
	EX_TABLE(8 ## n ## 5b, dst_error);	\
	EX_TABLE(8 ## n ## 6b, dst_error);	\
	EX_TABLE(8 ## n ## 7b, dst_error);
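/*
 * The two macros above stay in lockstep through the pasted labels:
 * for n = 0, CSUM_COPY_16_BYTES_WITHEX emits labels 800: through
 * 807: on its loads and stores, and CSUM_COPY_16_BYTES_EXCODE(0)
 * emits the matching exception-table entries, routing faults on the
 * loads (800-803) to src_error and faults on the stores (804-807)
 * to dst_error.
 */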

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"checksum_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

_GLOBAL(csum_partial_copy_generic)
	stwu	r1,-16(r1)
	stw	r7,12(r1)		/* save src_err pointer */
	stw	r8,8(r1)		/* save dst_err pointer */

	addic	r12,r6,0		/* sum into r12, clear carry */
	addi	r6,r4,-4		/* dst-4, for pre-incremented stores */
	neg	r0,r4
	addi	r4,r3,-4		/* src-4, for pre-incremented loads */
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	crset	4*cr7+eq		/* assume even dst, no final rotate */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	rlwinm	r7,r6,3,0x8		/* r7 = 8 if dst is odd, else 0 */
	rlwnm	r12,r12,r7,0,31	/* odd destination address: rotate one byte */
	cmplwi	cr7,r7,0	/* is destination address even ? */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8
	rlwimi	r3,r9,0,24,31
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4			/* offset for the dcbz below */
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */
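/*
 * In C terms, the prefetch distance chosen above (kept in r7, in
 * units of cachelines) is roughly (sketch only; "lines" stands for
 * the cacheline count computed into r0 at 58:):
 *
 *	int distance;
 *	if (lines <= 1)
 *		distance = 0;
 *	else if (lines <= MAX_COPY_PREFETCH)
 *		distance = 1;
 *	else
 *		distance = MAX_COPY_PREFETCH;
 *
 * The loop at 114: below then copies (lines - distance) cachelines
 * with dcbt issued "distance" lines ahead, and loops back once with
 * distance 0 to finish the last few lines without prefetching past
 * the end of the buffer.
 */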

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8
	adde	r12,r12,r0
66:	addze	r3,r12
	addi	r1,r1,16
	beqlr+	cr7
	rlwinm	r3,r3,8,0,31	/* odd destination address: rotate one byte */
	blr

/* read fault */
src_error:
	lwz	r7,12(r1)
	addi	r1,r1,16
	cmpwi	cr0,r7,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r7)
	blr
/* write fault */
dst_error:
	lwz	r8,8(r1)
	addi	r1,r1,16
	cmpwi	cr0,r8,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r8)
	blr

	EX_TABLE(70b, src_error);
	EX_TABLE(71b, dst_error);
	EX_TABLE(72b, src_error);
	EX_TABLE(73b, dst_error);
	EX_TABLE(54b, dst_error);

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * src_error (if in read part) or dst_error (if in write part)
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

	EX_TABLE(30b, src_error);
	EX_TABLE(31b, dst_error);
	EX_TABLE(40b, src_error);
	EX_TABLE(41b, dst_error);
	EX_TABLE(50b, src_error);
	EX_TABLE(51b, dst_error);

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 */
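/*
 * C sketch of the computation below (illustration only, with loosely
 * kernel-style types; rol32 is the helper from <linux/bitops.h>, and
 * the final three lines mirror csum_fold):
 *
 *	static __sum16 csum_ipv6_magic_ref(const struct in6_addr *s6,
 *					   const struct in6_addr *d6,
 *					   __u32 len, __u8 proto, u32 sum)
 *	{
 *		u64 s = (u64)sum + len + proto;	// asm assumes len + proto
 *		u32 t;				// doesn't carry
 *		int i;
 *
 *		for (i = 0; i < 4; i++)
 *			s += (u64)s6->s6_addr32[i] + d6->s6_addr32[i];
 *		while (s >> 32)			// fold 64-bit sum to 32 bits
 *			s = (s & 0xffffffff) + (s >> 32);
 *		t = (u32)s;
 *		t += rol32(t, 16);	// halves' sum lands in the top half
 *		return (__sum16)(~t >> 16);
 *	}
 */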

_GLOBAL(csum_ipv6_magic)
	lwz	r8, 0(r3)
	lwz	r9, 4(r3)
	addc	r0, r7, r8
	lwz	r10, 8(r3)
	adde	r0, r0, r9
	lwz	r11, 12(r3)
	adde	r0, r0, r10
	lwz	r8, 0(r4)
	adde	r0, r0, r11
	lwz	r9, 4(r4)
	adde	r0, r0, r8
	lwz	r10, 8(r4)
	adde	r0, r0, r9
	lwz	r11, 12(r4)
	adde	r0, r0, r10
	add	r5, r5, r6	/* assumption: len + proto doesn't carry */
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0		/* add in final carry */
	rotlwi	r3, r0, 16	/* fold 32-bit sum: swap halves... */
	add	r3, r0, r3	/* ...their sum ends up in the top half */
	not	r3, r3
	rlwinm	r3, r3, 16, 16, 31	/* return complemented top half */
	blr
EXPORT_SYMBOL(csum_ipv6_magic)