/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

	.text

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * __csum_partial(buff, len, sum)
 */
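/*
 * A rough C equivalent of the routine below (an illustrative sketch
 * only, not the kernel's generic implementation: it ignores the
 * alignment fix-up and the 4-word unrolling, the 32-bit value it
 * returns may differ from the assembly's although both fold to the
 * same 16-bit checksum, and the trailing-byte handling assumes big
 * endian, which is what 32-bit powerpc runs):
 *
 *	u32 __csum_partial(const void *buff, int len, u32 sum)
 *	{
 *		const u16 *p = buff;
 *		u64 s = sum;
 *
 *		while (len >= 2) {
 *			s += *p++;			// 16-bit halfwords
 *			len -= 2;
 *		}
 *		if (len)				// odd trailing byte
 *			s += (u32)*(const u8 *)p << 8;	// upper byte of a halfword
 *		while (s >> 32)				// wrap carries back in
 *			s = (s & 0xffffffff) + (s >> 32);
 *		return s;
 *	}
 */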
_GLOBAL(__csum_partial)
	subi	r3,r3,4
	srawi.	r6,r4,2		/* Divide len by 4 and also clear carry */
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r0,r3,2		/* Align buffer to longword boundary */
	beq+	1f
	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
	subi	r4,r4,2
	addi	r3,r3,2
	srwi.	r6,r4,2		/* # words to do */
	adde	r5,r5,r0
	beq	3f
1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
	beq	21f
	mtctr	r6
2:	lwzu	r0,4(r3)
	adde	r5,r5,r0
	bdnz	2b
21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
	beq	3f
	lwz	r0,4(r3)
	mtctr	r6
	lwz	r6,8(r3)
	adde	r5,r5,r0
	lwz	r7,12(r3)
	adde	r5,r5,r6
	lwzu	r8,16(r3)
	adde	r5,r5,r7
	bdz	23f
22:	lwz	r0,4(r3)
	adde	r5,r5,r8
	lwz	r6,8(r3)
	adde	r5,r5,r0
	lwz	r7,12(r3)
	adde	r5,r5,r6
	lwzu	r8,16(r3)
	adde	r5,r5,r7
	bdnz	22b
23:	adde	r5,r5,r8
3:	andi.	r0,r4,2
	beq+	4f
	lhz	r0,4(r3)
	addi	r3,r3,2
	adde	r5,r5,r0
4:	andi.	r0,r4,1
	beq+	5f
	lbz	r0,4(r3)
	slwi	r0,r0,8		/* Upper byte of word */
	adde	r5,r5,r0
5:	addze	r3,r5		/* add in final carry */
	blr
EXPORT_SYMBOL(__csum_partial)

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 */
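/*
 * Behaviourally (fault handling aside) this is just "copy, then
 * checksum".  A minimal C sketch, assuming in-kernel memory so that
 * no faults occur and *src_err/*dst_err are never written:
 *
 *	u32 csum_partial_copy_generic(const void *src, void *dst, int len,
 *				      u32 sum, int *src_err, int *dst_err)
 *	{
 *		memcpy(dst, src, len);
 *		return __csum_partial(dst, len, sum);
 *	}
 *
 * The real routine below interleaves the loads, stores and adds,
 * prefetches the source with dcbt, pre-zeroes destination cachelines
 * with dcbz, and uses exception-table entries to report -EFAULT
 * through src_err/dst_err instead.
 */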
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
	adde	r12,r12,r7;	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
	adde	r12,r12,r8;	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
	adde	r12,r12,r9;	\
8 ## n ## 7:			\
	stwu	r10,16(r6);	\
	adde	r12,r12,r10

#define CSUM_COPY_16_BYTES_EXCODE(n)		\
	EX_TABLE(8 ## n ## 0b, src_error);	\
	EX_TABLE(8 ## n ## 1b, src_error);	\
	EX_TABLE(8 ## n ## 2b, src_error);	\
	EX_TABLE(8 ## n ## 3b, src_error);	\
	EX_TABLE(8 ## n ## 4b, dst_error);	\
	EX_TABLE(8 ## n ## 5b, dst_error);	\
	EX_TABLE(8 ## n ## 6b, dst_error);	\
	EX_TABLE(8 ## n ## 7b, dst_error);
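/*
 * Each numbered local label in CSUM_COPY_16_BYTES_WITHEX(n) marks one
 * faultable load or store, and CSUM_COPY_16_BYTES_EXCODE(n) emits the
 * matching exception-table entry for it.  For example, the n = 0
 * expansion creates labels 800: through 807:; a fault at 800b-803b
 * (the loads) is redirected to src_error, and a fault at 804b-807b
 * (the stores) to dst_error.
 */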
	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"checksum_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

_GLOBAL(csum_partial_copy_generic)
	stwu	r1,-16(r1)
	stw	r7,12(r1)
	stw	r8,8(r1)

	addic	r12,r6,0
	addi	r6,r4,-4
	neg	r0,r4
	addi	r4,r3,-4
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	crset	4*cr7+eq
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	rlwinm	r7,r6,3,0x8
	rlwnm	r12,r12,r7,0,31	/* odd destination address: rotate one byte */
	cmplwi	cr7,r7,0	/* is destination address even ? */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8
	rlwimi	r3,r9,0,24,31
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */
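/*
 * Net effect: r7 (the prefetch distance, in cachelines) ends up as 0
 * when at most one complete cacheline is left, 1 for transfers of up
 * to MAX_COPY_PREFETCH cachelines, and MAX_COPY_PREFETCH beyond that.
 * For instance, assuming 32-byte cachelines and MAX_COPY_PREFETCH == 4
 * (both depend on the CPU configuration), a 1 KiB copy keeps the dcbt
 * touches 4 lines ahead of the copy loop, while a 64-byte copy stays
 * 1 line ahead.
 */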

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8
	adde	r12,r12,r0
66:	addze	r3,r12
	addi	r1,r1,16
	beqlr+	cr7
	rlwinm	r3,r3,8,0,31	/* odd destination address: rotate one byte */
	blr

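/*
 * Fault handlers: the prologue above saved src_err (r7) at 12(r1) and
 * dst_err (r8) at 8(r1).  The exception-table entries below send a
 * faulting access to one of the two handlers; each pops the stack
 * frame and, if the corresponding error pointer is non-NULL, stores
 * -EFAULT through it before returning.
 */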
/* read fault */
src_error:
	lwz	r7,12(r1)
	addi	r1,r1,16
	cmpwi	cr0,r7,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r7)
	blr
/* write fault */
dst_error:
	lwz	r8,8(r1)
	addi	r1,r1,16
	cmpwi	cr0,r8,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r8)
	blr

	EX_TABLE(70b, src_error);
	EX_TABLE(71b, dst_error);
	EX_TABLE(72b, src_error);
	EX_TABLE(73b, dst_error);
	EX_TABLE(54b, dst_error);

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * src_error (if in read part) or dst_error (if in write part)
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

	EX_TABLE(30b, src_error);
	EX_TABLE(31b, dst_error);
	EX_TABLE(40b, src_error);
	EX_TABLE(41b, dst_error);
	EX_TABLE(50b, src_error);
	EX_TABLE(51b, dst_error);

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 */
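/*
 * A rough C equivalent of the pseudo-header sum computed below (an
 * illustrative sketch, not the generic lib version; like the assembly
 * it relies on 32-bit powerpc being big-endian, so len and proto need
 * no byte swapping before being added):
 *
 *	__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *				const struct in6_addr *daddr,
 *				__u32 len, __u8 proto, __wsum sum)
 *	{
 *		u64 s = (__force u32)sum;
 *		int i;
 *
 *		for (i = 0; i < 4; i++)
 *			s += saddr->s6_addr32[i];
 *		for (i = 0; i < 4; i++)
 *			s += daddr->s6_addr32[i];
 *		s += len + proto;	// same "no carry" assumption as below
 *		while (s >> 32)		// wrap carries back in
 *			s = (s & 0xffffffff) + (s >> 32);
 *		s = (s & 0xffff) + (s >> 16);	// fold to 16 bits
 *		s = (s & 0xffff) + (s >> 16);
 *		return (__force __sum16)~s;	// one's complement
 *	}
 */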

_GLOBAL(csum_ipv6_magic)
	/* r3 = saddr, r4 = daddr, r5 = len, r6 = proto, r7 = sum */
	lwz	r8, 0(r3)
	lwz	r9, 4(r3)
	addc	r0, r7, r8
	lwz	r10, 8(r3)
	adde	r0, r0, r9
	lwz	r11, 12(r3)
	adde	r0, r0, r10
	lwz	r8, 0(r4)
	adde	r0, r0, r11
	lwz	r9, 4(r4)
	adde	r0, r0, r8
	lwz	r10, 8(r4)
	adde	r0, r0, r9
	lwz	r11, 12(r4)
	adde	r0, r0, r10
	add	r5, r5, r6	/* assumption: len + proto doesn't carry */
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0		/* add in the final carry */
	rotlwi	r3, r0, 16	/* fold the 32-bit sum: swap halfwords... */
	add	r3, r0, r3	/* ...and add; the high halfword holds the fold */
	not	r3, r3		/* one's complement */
	rlwinm	r3, r3, 16, 16, 31	/* return it in the low 16 bits of r3 */
	blr
EXPORT_SYMBOL(csum_ipv6_magic)