xref: /openbmc/linux/arch/powerpc/lib/checksum_64.S (revision df2634f43f5106947f3735a0b61a6527a4b278cd)
1/*
2 * This file contains assembly-language implementations
3 * of IP-style 1's complement checksum routines.
4 *
5 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
6 *
7 *  This program is free software; you can redistribute it and/or
8 *  modify it under the terms of the GNU General Public License
9 *  as published by the Free Software Foundation; either version
10 *  2 of the License, or (at your option) any later version.
11 *
12 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
13 */
14
15#include <linux/sys.h>
16#include <asm/processor.h>
17#include <asm/errno.h>
18#include <asm/ppc_asm.h>
19
/*
 * ip_fast_csum(r3=buf, r4=len)
 *
 * Checksum of an IP header.  len counts 32-bit words and callers always
 * pass at least 5 (5 is the usual value, larger ones are possible).
 * Because the word count is arbitrary the data is fetched one word at a
 * time rather than in doublewords.
 *
 * Returns in r3 the complemented sum folded down to 16 bits.
 * Uses r0 (accumulator), r4, r5 and CTR as scratch.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)		/* r0 = word 0 */
	lwzu	r5,4(r3)		/* r5 = word 1; r3 now points at it */
	addic.	r4,r4,-2		/* words still to be added; sets CR0 */
	mtctr	r4
	addc	r0,r0,r5		/* word 0 + word 1; CA is redefined here */
	blelr-				/* no further words (never for len >= 5) */
.Lip_csum_next_word:
	lwzu	r5,4(r3)
	adde	r0,r0,r5
	bdnz	.Lip_csum_next_word
	addze	r0,r0			/* pick up the last carry */
	rotldi	r5,r0,32		/* 64 -> 32: add the swapped halves ... */
	add	r0,r0,r5
	srdi	r0,r0,32		/* ... and keep the upper word */
	rotlwi	r3,r0,16		/* 32 -> 16: same trick on halfwords */
	add	r3,r0,r3
	not	r3,r3			/* ones' complement of the sum */
	srwi	r3,r3,16
	blr
46
/*
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 *
 * Checksum of the TCP/UDP pseudo-header.  The four 32-bit quantities are
 * simply added in a 64-bit register; nothing 64-bit specific is gained,
 * but the additions can overflow into the upper word, so the result is
 * folded from 64 bits to 32 and then from 32 to 16.
 *
 * Returns in r3 the complemented 16-bit sum.  Uses r0 and r4 as scratch
 * and modifies r5.
 */
_GLOBAL(csum_tcpudp_magic)
	addc	r0,r3,r4		/* saddr + daddr; defines CA */
	rlwimi	r5,r6,16,0,15		/* low 16 bits of proto go above len */
	adde	r0,r0,r5
	adde	r0,r0,r7		/* ... + incoming sum */
	rotldi	r4,r0,32		/* 64 -> 32: add the swapped halves ... */
	add	r0,r0,r4
	srdi	r0,r0,32		/* ... and keep the upper word */
	rotlwi	r3,r0,16		/* 32 -> 16: same trick on halfwords */
	add	r3,r3,r0
	not	r3,r3
	srwi	r3,r3,16
	blr
67
/* Bytes reserved by the routines in this file when they open a stack frame. */
#define STACKFRAMESIZE	256
/*
 * Offset from r1 of the save slot for GPR number i (i >= 14): consecutive
 * 8-byte slots starting 112 bytes into the frame.
 * NOTE(review): 112 presumably skips the ABI-mandated frame header and
 * parameter save area -- confirm against the 64-bit PowerPC ELF ABI.
 */
#define STK_REG(i)	(112 + 8*((i)-14))
70
/*
 * csum_partial(r3=buff, r4=len, r5=sum)
 *
 * Adds the len bytes starting at buff to the 32-bit partial checksum
 * "sum" using ones' complement (end-around carry) arithmetic and returns
 * the result, folded back to 32 bits, in r3.
 *
 * Register use:
 *	r0		64-bit accumulator
 *	r3		read cursor
 *	r4		bytes remaining
 *	r6,r7,r9-r12	scratch / load temporaries
 *	r14-r16		extra load temporaries, saved and restored around
 *			the unrolled loop
 *
 * The carry bit (CA) is live from the first instruction up to the addze
 * at .Lcsum_finish; everything placed between the adde's leaves CA alone.
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* r0 = sum; adding 0 leaves CA clear */

	srdi.	r6,r4,3			/* r6 = len / 8 */
	beq	.Lcsum_tail_word	/* fewer than 8 bytes in total */

	/*
	 * For an even address that is not on a doubleword boundary, consume
	 * halfwords until it is.  Odd addresses are expected to be rare and
	 * would need extra work to checksum correctly, so they get no special
	 * treatment here and simply pay for unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6		/* r6 = halfwords to consume */
	mtctr	r6

.Lcsum_align_loop:
	lhz	r6,0(r3)
	addi	r3,r3,2
	subi	r4,r4,2
	adde	r0,r0,r6
	bdnz	.Lcsum_align_loop

.Lcsum_aligned:
	/*
	 * The main loop handles 64 bytes per pass.  It is split into an entry
	 * limb (the first four loads), the loop proper, and an exit limb that
	 * sums the final 64-byte block, so it is only used when at least
	 * 128 bytes remain.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6			/* number of 64-byte blocks ... */
	subi	r6,r6,1			/* ... minus the one done by the exit limb */
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r16,STK_REG(r16)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r14,STK_REG(r14)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)
	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * Instruction order inside the loop is deliberate and must be kept:
	 * according to the original authors, back-to-back adde's cost 2 cycles
	 * on POWER6 and POWER7 because of the XER dependency, which bounds the
	 * loop at 16 cycles per pass, and this schedule was measured to reach
	 * that bound on both.
	 */
	.align 5
.Lcsum_unrolled_loop:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)		/* first half of the next block */
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	.Lcsum_unrolled_loop

	/* Exit limb: the last 64-byte block, with nothing further to preload. */
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r16,STK_REG(r16)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r14,STK_REG(r14)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63		/* bytes left after the 64-byte blocks */

.Lcsum_tail_doublewords:		/* at most 127 bytes left */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
.Lcsum_dword_loop:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	.Lcsum_dword_loop

	andi.	r4,r4,7

.Lcsum_tail_word:			/* at most 7 bytes left */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	subi	r4,r4,4
	addi	r3,r3,4
	adde	r0,r0,r6

.Lcsum_tail_halfword:			/* at most 3 bytes left */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6

.Lcsum_tail_byte:			/* at most 1 byte left */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* the byte becomes the high half of a 16-bit unit */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* pick up the last carry */
	rotldi	r4,r0,32		/* 64 -> 32: add the swapped halves ... */
	add	r3,r4,r0
	srdi	r3,r3,32		/* ... and return the upper word */
	blr
230
231
/*
 * Exception-table helpers.  Each macro places a numeric label at the
 * current location and adds a (label, fixup) pair to __ex_table, so it is
 * written as a prefix on the line of the user access it covers:
 *
 *	source;	lhz	r6,0(r3)
 *
 *	source / dest			access made while no stack frame
 *					of ours is active
 *	source_framed / dest_framed	access made inside the unrolled
 *					loop, i.e. while r14-r16 are
 *					spilled and r1 has been moved down
 *					by STACKFRAMESIZE
 *
 * The framed fixups undo the frame before reporting the error; without
 * them a fault in the unrolled loop would return to the caller with
 * r14-r16 and r1 corrupted.
 */
	.macro source
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error
	.previous
	.endm

	.macro dest
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error
	.previous
	.endm

	.macro source_framed
110:
	.section __ex_table,"a"
	.align 3
	.llong 110b,.Lsrc_error_framed
	.previous
	.endm

	.macro dest_framed
210:
	.section __ex_table,"a"
	.align 3
	.llong 210b,.Ldest_error_framed
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively (a NULL pointer is skipped).
 * The caller must take any action required in this case (zeroing memory,
 * recalculating partial checksum etc); the fixup paths return without
 * setting r3, so the returned value is not meaningful after a fault.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 *
 * Register use:
 *	r0		64-bit accumulator (the carry bit is live until
 *			.Lcopy_finish)
 *	r3, r4		source / destination cursors
 *	r5		bytes remaining
 *	r6,r9-r12	scratch / data temporaries
 *	r14-r16		extra data temporaries, saved and restored around
 *			the unrolled loop
 *	r7, r8		src_err / dst_err -- read by the fixup code, so they
 *			must never be used as scratch anywhere in this routine
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* r0 = sum; adding 0 leaves CA clear */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4			/* scratch must not be r7 (src_err) */
	sub	r6,r9,r6		/* r6 = halfwords to the boundary */
	mtctr	r6

1:
source;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	/*
	 * From here until the frame is popped again, every user access must
	 * use the *_framed exception macros.
	 */
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

source_framed;	ld	r6,0(r3)
source_framed;	ld	r9,8(r3)

source_framed;	ld	r10,16(r3)
source_framed;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source_framed;	ld	r12,32(r3)
source_framed;	ld	r14,40(r3)

	adde	r0,r0,r9
source_framed;	ld	r15,48(r3)
source_framed;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest_framed;	std	r6,0(r4)
dest_framed;	std	r9,8(r4)

	adde	r0,r0,r11
dest_framed;	std	r10,16(r4)
dest_framed;	std	r11,24(r4)

	adde	r0,r0,r12
dest_framed;	std	r12,32(r4)
dest_framed;	std	r14,40(r4)

	adde	r0,r0,r14
dest_framed;	std	r15,48(r4)
dest_framed;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source_framed;	ld	r6,0(r3)
source_framed;	ld	r9,8(r3)

	adde	r0,r0,r16
source_framed;	ld	r10,16(r3)
source_framed;	ld	r11,24(r3)
	bdnz	2b


	/* Exit limb: final 64-byte block, nothing further to preload. */
	adde	r0,r0,r6
source_framed;	ld	r12,32(r3)
source_framed;	ld	r14,40(r3)

	adde	r0,r0,r9
source_framed;	ld	r15,48(r3)
source_framed;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest_framed;	std	r6,0(r4)
dest_framed;	std	r9,8(r4)

	adde	r0,r0,r11
dest_framed;	std	r10,16(r4)
dest_framed;	std	r11,24(r4)

	adde	r0,r0,r12
dest_framed;	std	r12,32(r4)
dest_framed;	std	r14,40(r4)

	adde	r0,r0,r14
dest_framed;	std	r15,48(r4)
dest_framed;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE
	/* Frame is gone: back to the plain source/dest macros. */

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
source;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dest;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

source;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dest;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

source;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

source;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dest;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

	/*
	 * Fixup for a faulting source access inside the unrolled loop:
	 * restore the spilled registers and pop the frame, then report the
	 * error exactly like the frameless case.
	 */
.Lsrc_error_framed:
	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error:
	cmpdi	0,r7,0			/* src_err == NULL: nothing to store */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

	/* Same as above, for a faulting destination access. */
.Ldest_error_framed:
	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error:
	cmpdi	0,r8,0			/* dst_err == NULL: nothing to store */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
458