/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed.  So this code does not
 * attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
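
/*
 * A rough C sketch (illustrative only, not built from this file) of what
 * ip_fast_csum above computes, assuming "iph" points to the header and
 * "ihl" is its length in 32-bit words (>= 5):
 *
 *	unsigned short ip_fast_csum_sketch(const void *iph, unsigned int ihl)
 *	{
 *		const unsigned int *p = iph;
 *		unsigned long sum = 0;		// 64-bit accumulator, as in the asm
 *		unsigned int i;
 *
 *		for (i = 0; i < ihl; i++)	// sum the 32-bit words of the header
 *			sum += p[i];
 *		while (sum >> 16)		// fold carries back into the low 16 bits
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return ~sum & 0xffff;		// one's complement of the folded sum
 *	}
 */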

/*
 * Compute checksum of TCP or UDP pseudo-header:
 *   csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * There is no real gain in doing this specially for 64 bit, but the
 * 32 bit additions may spill into the upper bits of the doubleword,
 * so we still must fold the result down from 64 bits.
 */
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
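
/*
 * Roughly the same computation in C, treating the arguments as plain 32-bit
 * integers and ignoring byte-order details; the function name is only
 * illustrative:
 *
 *	unsigned short csum_tcpudp_magic_sketch(unsigned int saddr,
 *			unsigned int daddr, unsigned int len,
 *			unsigned int proto, unsigned int sum)
 *	{
 *		unsigned long s = sum;		// 64-bit accumulator
 *
 *		s += saddr;
 *		s += daddr;
 *		s += (proto << 16) + len;	// proto in the upper halfword, as the rlwimi does
 *		while (s >> 16)			// fold everything down to 16 bits
 *			s = (s & 0xffff) + (s >> 16);
 *		return ~s & 0xffff;
 *	}
 */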

#define STACKFRAMESIZE 256
#define STK_REG(i)	(112 + ((i)-14)*8)

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
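/*
 * As a rough C sketch of the routine below (the alignment prologue and the
 * 64-byte unrolled loop are collapsed into simple loops, and alignment and
 * byte-order details are glossed over; names are illustrative only):
 *
 *	unsigned int csum_partial_sketch(const void *buff, int len, unsigned int sum)
 *	{
 *		const unsigned char *p = buff;
 *		unsigned long acc = sum;			// 64-bit accumulator
 *
 *		while (len >= 4) {				// whole 32-bit words
 *			acc += *(const unsigned int *)p;
 *			p += 4;
 *			len -= 4;
 *		}
 *		if (len >= 2) {					// trailing halfword
 *			acc += *(const unsigned short *)p;
 *			p += 2;
 *			len -= 2;
 *		}
 *		if (len)					// trailing byte, padded to 16 bits
 *			acc += (unsigned long)*p << 8;
 *		acc = (acc & 0xffffffffUL) + (acc >> 32);	// fold 64 bits down to 32
 *		acc = (acc & 0xffffffffUL) + (acc >> 32);	// ...and fold in the carry
 *		return (unsigned int)acc;
 *	}
 */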
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
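	/*
	 * In rough C terms (a sketch, with buff/len/sum standing for r3/r4/r0
	 * and assuming buff is already at least halfword aligned), the
	 * prologue below does:
	 *
	 *	while ((unsigned long)buff & 7) {
	 *		sum += *(const unsigned short *)buff;	// consume halfwords
	 *		buff += 2;
	 *		len -= 2;
	 *	}
	 */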
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
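	/*
	 * For example, with len = 192 the srdi by 7 below is non-zero so we
	 * take this path; the srdi by 6 gives 3 and the subi leaves ctr = 2,
	 * so the loop body runs twice (128 bytes) and the copy of the body
	 * after the bdnz consumes the final 64 bytes.
	 */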
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * This code needs to be reworked to take advantage of 64 bit sum+copy.
 * However, due to tokenring halfword alignment problems this will be very
 * tricky.  For now we'll leave it until we instrument it somehow.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
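/*
 * Setting aside the -EFAULT reporting described above (faults cannot be
 * modelled in plain C), the copy-and-sum itself is roughly the following
 * sketch; the name is illustrative only:
 *
 *	unsigned int csum_partial_copy_sketch(const void *src, void *dst,
 *			int len, unsigned int sum)
 *	{
 *		const unsigned char *s = src;
 *		unsigned char *d = dst;
 *		unsigned long acc = sum;			// 64-bit accumulator
 *
 *		while (len >= 4) {				// copy and sum a word at a time
 *			unsigned int w = *(const unsigned int *)s;
 *			*(unsigned int *)d = w;
 *			acc += w;
 *			s += 4; d += 4; len -= 4;
 *		}
 *		if (len >= 2) {					// trailing halfword
 *			unsigned short h = *(const unsigned short *)s;
 *			*(unsigned short *)d = h;
 *			acc += h;
 *			s += 2; d += 2; len -= 2;
 *		}
 *		if (len) {					// trailing byte, as the upper byte
 *			*d = *s;
 *			acc += (unsigned long)*s << 8;
 *		}
 *		acc = (acc & 0xffffffffUL) + (acc >> 32);	// fold to 32 bits
 *		acc = (acc & 0xffffffffUL) + (acc >> 32);
 *		return (unsigned int)acc;
 *	}
 */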
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0
	subi	r3,r3,4
	subi	r4,r4,4
	srwi.	r6,r5,2
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r9,r4,2		/* Align dst to longword boundary */
	beq+	1f
81:	lhz	r6,4(r3)	/* do 2 bytes to get aligned */
	addi	r3,r3,2
	subi	r5,r5,2
91:	sth	r6,4(r4)
	addi	r4,r4,2
	addc	r0,r0,r6
	srwi.	r6,r5,2		/* # words to do */
	beq	3f
1:	mtctr	r6
82:	lwzu	r6,4(r3)	/* the bdnz has zero overhead, so it should */
92:	stwu	r6,4(r4)	/* be unnecessary to unroll this loop */
	adde	r0,r0,r6
	bdnz	82b
	andi.	r5,r5,3
3:	cmpwi	0,r5,2
	blt+	4f
83:	lhz	r6,4(r3)
	addi	r3,r3,2
	subi	r5,r5,2
93:	sth	r6,4(r4)
	addi	r4,r4,2
	adde	r0,r0,r6
4:	cmpwi	0,r5,1
	bne+	5f
84:	lbz	r6,4(r3)
94:	stb	r6,4(r4)
	slwi	r6,r6,8		/* Upper byte of word */
	adde	r0,r0,r6
5:	addze	r3,r0		/* add in final carry (unlikely with 64-bit regs) */
	rldicl	r4,r3,32,0	/* fold 64 bit value */
	add	r3,r4,r3
	srdi	r3,r3,32
	blr

/* These shouldn't go in the fixup section, since that would
   cause the ex_table addresses to get out of order. */

	.globl src_error_1
src_error_1:
	li	r6,0
	subi	r5,r5,2
95:	sth	r6,4(r4)
	addi	r4,r4,2
	srwi.	r6,r5,2
	beq	3f
	mtctr	r6
	.globl src_error_2
src_error_2:
	li	r6,0
96:	stwu	r6,4(r4)
	bdnz	96b
3:	andi.	r5,r5,3
	beq	src_error
	.globl src_error_3
src_error_3:
	li	r6,0
	mtctr	r5
	addi	r4,r4,3
97:	stbu	r6,1(r4)
	bdnz	97b
	.globl src_error
src_error:
	cmpdi	0,r7,0
	beq	1f
	li	r6,-EFAULT
	stw	r6,0(r7)
1:	addze	r3,r0
	blr

	.globl dst_error
dst_error:
	cmpdi	0,r8,0
	beq	1f
	li	r6,-EFAULT
	stw	r6,0(r8)
1:	addze	r3,r0
	blr

.section __ex_table,"a"
	.align  3
	.llong	81b,src_error_1
	.llong	91b,dst_error
	.llong	82b,src_error_2
	.llong	92b,dst_error
	.llong	83b,src_error_3
	.llong	93b,dst_error
	.llong	84b,src_error_3
	.llong	94b,dst_error
	.llong	95b,dst_error
	.llong	96b,dst_error
	.llong	97b,dst_error
