/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed.  So this code does not
 * attempt to use doubleword instructions.
 */
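/*
 * For reference, what this routine computes is roughly the following C
 * sketch (illustrative only, not part of the build; the helper name
 * ip_fast_csum_ref and the plain C types are made up here):
 *
 *	static unsigned short ip_fast_csum_ref(const unsigned int *buf,
 *					       unsigned int len)
 *	{
 *		unsigned long sum = 0;			// 64-bit accumulator
 *		unsigned int i;
 *
 *		for (i = 0; i < len; i++)		// len is in 32-bit words
 *			sum += buf[i];
 *		sum = (sum & 0xffffffff) + (sum >> 32);	// fold 64 -> 32 bits
 *		sum = (sum & 0xffffffff) + (sum >> 32);	// add the carry back in
 *		sum = (sum & 0xffff) + (sum >> 16);	// fold 32 -> 16 bits
 *		sum = (sum & 0xffff) + (sum >> 16);	// add the carry back in
 *		return ~sum & 0xffff;			// 1's complement result
 *	}
 *
 * The assembly below gets the same effect with carry-propagating adds
 * (addc/adde) and the two explicit folds at the end.
 */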
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
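/*
 * For orientation, a rough C model of the big-endian case (an
 * illustrative sketch, not part of the build; the helper name
 * csum_partial_ref and the plain integer types are made up here).  It
 * ignores the alignment and unrolling games played below and, much like
 * the generic version in lib/checksum.c, just accumulates the buffer
 * into a wide 1's complement sum:
 *
 *	static unsigned int csum_partial_ref(const unsigned char *buff,
 *					     int len, unsigned int sum)
 *	{
 *		unsigned long acc = sum;
 *		int i;
 *
 *		for (i = 0; i + 1 < len; i += 2)	// whole 16-bit words
 *			acc += (buff[i] << 8) | buff[i + 1];
 *		if (len & 1)				// trailing odd byte
 *			acc += buff[len - 1] << 8;
 *		acc = (acc & 0xffffffff) + (acc >> 32);	// fold 64 -> 32 bits
 *		acc = (acc & 0xffffffff) + (acc >> 32);	// add the carry back in
 *		return acc;				// 32-bit partial csum
 *	}
 *
 * The 32-bit value produced by the assembly need not match this model
 * bit for bit, but the two are equivalent under 1's complement
 * addition: they fold to the same 16-bit checksum.
 */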
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
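	/*
	 * Alignment arithmetic, spelled out: the rldicl below extracts
	 * (buff >> 1) & 0x3 and branches past the fixup when it is zero,
	 * i.e. when buff is already doubleword aligned (bit 0 is ignored,
	 * since odd addresses are not handled specially).  Otherwise the
	 * loop at 1: consumes
	 *
	 *	4 - ((buff >> 1) & 0x3)
	 *
	 * halfwords, which for a halfword-aligned buff is
	 * (8 - (buff & 0x7)) / 2 and lands r3 on the next doubleword
	 * boundary.
	 */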
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
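	/*
	 * Loop accounting, spelled out: once len >= 128 the counter is
	 * set to
	 *
	 *	ctr = (len >> 6) - 1
	 *
	 * 64-byte iterations, and the exit limb after the bdnz covers one
	 * more 64-byte block, so len & ~63 bytes are consumed here and
	 * the remaining len & 63 bytes fall through to the tail code.
	 */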
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr


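/*
 * The macros below mark the loads from src and the stores to dst in
 * csum_partial_copy_generic that may fault.  Each one drops a numbered
 * local label on the access and emits an __ex_table entry pairing that
 * address with a fixup label, so a fault on the access branches to the
 * matching error path instead of oopsing.  The "nr" (no-restore)
 * variants are used outside the region where r14-r16 are saved on the
 * stack, so their fixup paths skip restoring those registers.
 */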
	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
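/*
 * Seen from C, the call is roughly (a sketch only; the real declaration
 * lives in the powerpc <asm/checksum.h>):
 *
 *	__wsum csum_partial_copy_generic(const void *src, void *dst,
 *					 int len, __wsum sum,
 *					 int *src_err, int *dst_err);
 *
 * and a caller such as csum_and_copy_from_user() does something like:
 *
 *	int err = 0;
 *	__wsum csum = csum_partial_copy_generic(src, dst, len, sum,
 *						&err, NULL);
 *	if (err)
 *		handle_fault();	// zero dst, redo the copy, etc.
 *
 * (handle_fault() is a stand-in for whatever recovery the caller needs.)
 * A NULL error pointer simply suppresses the -EFAULT store for that
 * side.
 */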
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr