xref: /openbmc/linux/arch/powerpc/lib/checksum_64.S (revision 95e9fd10)
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed.  So this code does not
 * attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

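/*
 * For reference, a rough C equivalent of the routine above (an
 * illustrative sketch only, not code from this file; the kernel's
 * portable fallback lives in lib/checksum.c). It reads 32-bit words
 * directly, as the assembly does on these big-endian machines:
 *
 *	unsigned short ip_fast_csum_ref(const void *buf, unsigned int ihl)
 *	{
 *		const unsigned int *p = buf;
 *		unsigned long sum = 0;		// 64-bit accumulator
 *		unsigned int i;
 *
 *		for (i = 0; i < ihl; i++)	// ihl counts 32-bit words
 *			sum += p[i];
 *
 *		sum = (sum & 0xffffffff) + (sum >> 32);	// fold 64 -> 32
 *		while (sum >> 16)			// fold 32 -> 16
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return ~sum & 0xffff;	// one's complement of the sum
 *	}
 */
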
/*
 * Compute checksum of TCP or UDP pseudo-header:
 *   csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * No real gain trying to do this specially for 64 bit, but
 * the 32 bit addition may spill into the upper bits of
 * the doubleword so we still must fold it down from 64.
 */
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

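/*
 * A C sketch of the pseudo-header sum computed above (illustration
 * only; names and types are simplified relative to the kernel's
 * csum_tcpudp_magic prototype):
 *
 *	unsigned short csum_tcpudp_magic_ref(unsigned int saddr,
 *					     unsigned int daddr,
 *					     unsigned short len,
 *					     unsigned char proto,
 *					     unsigned int sum)
 *	{
 *		unsigned long s = sum;
 *
 *		s += saddr;
 *		s += daddr;
 *		s += ((unsigned int)proto << 16) + len;	// proto above len
 *
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 64 -> 32
 *		while (s >> 16)				// fold 32 -> 16
 *			s = (s & 0xffff) + (s >> 16);
 *		return ~s & 0xffff;			// one's complement
 *	}
 */
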
/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6		/* halfwords needed to align */
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
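	/*
	 * In C terms, each pass of the loop below folds in eight
	 * doublewords (a sketch; add_with_carry() is a hypothetical
	 * stand-in for the adde chain through XER[CA]):
	 *
	 *	for (i = 0; i < nr_iterations; i++)
	 *		for (j = 0; j < 8; j++)
	 *			sum = add_with_carry(sum, src64[8 * i + j]);
	 */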
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

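/*
 * For reference, a C sketch of what csum_partial computes (an
 * illustration, not code from this file). It accumulates big-endian
 * halfwords rather than doublewords, so the 32-bit value returned may
 * differ numerically from the assembly's, but both fold down to the
 * same 16-bit checksum:
 *
 *	unsigned int csum_partial_ref(const void *buff, int len,
 *				      unsigned int sum)
 *	{
 *		const unsigned char *p = buff;
 *		unsigned long s = sum;
 *
 *		for (; len > 1; len -= 2, p += 2)	// halfword chunks
 *			s += (p[0] << 8) | p[1];
 *		if (len)				// odd trailing byte,
 *			s += (unsigned long)p[0] << 8;	// padded to 16 bits
 *
 *		while (s >> 32)			// fold 64 bits down to 32
 *			s = (s & 0xffffffff) + (s >> 32);
 *		return s;
 *	}
 */
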

	.macro source
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error
	.previous
	.endm

	.macro dest
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error
	.previous
	.endm

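/*
 * Each source/dest annotation above records the address of a user
 * access instruction together with a fixup address in the __ex_table
 * section. Conceptually, every .llong pair forms one entry of (a
 * sketch; at this point in the kernel's history an entry really is
 * two 64-bit addresses, hence the .align 3):
 *
 *	struct exception_table_entry {
 *		unsigned long insn;	// address that may fault
 *		unsigned long fixup;	// where to resume after a fault
 *	};
 *
 * If a marked load or store takes an access fault, the exception
 * handler looks up the faulting address in this table and resumes
 * execution at the fixup, here .Lsrc_error or .Ldest_error.
 */
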
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc.).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r7,4
	sub	r6,r7,r6		/* halfwords needed to align */
	mtctr	r6

1:
source;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
source;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dest;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

source;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dest;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

source;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

source;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dest;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	cmpdi	0,r7,0		/* was a src_err pointer supplied? */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	cmpdi	0,r8,0		/* was a dst_err pointer supplied? */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
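
/*
 * A sketch of how a caller might use csum_partial_copy_generic from C
 * (illustrative only; in the kernel, wrappers such as
 * csum_and_copy_from_user() perform the access checks and supply the
 * running sum before calling this routine):
 *
 *	int src_err = 0, dst_err = 0;
 *	unsigned int csum;
 *
 *	csum = csum_partial_copy_generic(src, dst, len, sum,
 *					 &src_err, &dst_err);
 *	if (src_err || dst_err) {
 *		// -EFAULT was stored through the corresponding pointer;
 *		// the caller must recover, e.g. by zeroing the
 *		// destination, before trusting any result
 *	}
 */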