/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
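/*
 * For reference, an illustrative C sketch of the computation (not part
 * of the build and not the kernel's generic implementation; the helper
 * name is made up).  It sums the buffer as 16-bit units and folds the
 * carries at the end, which is congruent (mod 0xffff) to what the code
 * below does with 64-bit loads and the XER carry:
 *
 *	static u32 csum_partial_sketch(const u8 *buff, int len, u32 sum)
 *	{
 *		u64 s = sum;
 *
 *		while (len >= 2) {
 *			s += *(const u16 *)buff;	// native-endian halfword
 *			buff += 2;
 *			len -= 2;
 *		}
 *		if (len)				// trailing odd byte
 *			s += (u64)*buff << 8;
 *		while (s >> 32)				// fold carries back in
 *			s = (s & 0xffffffff) + (s >> 32);
 *		return s;
 *	}
 */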
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b
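
	/*
	 * Worked example: if buff enters at 0x1006, the rldicl above
	 * gives r6 = (0x1006 >> 1) & 0x3 = 3, so ctr = 4 - 3 = 1 and the
	 * loop above consumes one halfword, leaving r3 doubleword
	 * aligned at 0x1008.
	 */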

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
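	/*
	 * Worked example: len = 200 takes this path (200 >= 128).  The
	 * ctr value below is 200/64 - 1 = 2, so the loop body runs twice
	 * (128 bytes), the exit limb after the loop accounts for another
	 * 64 bytes, and the andi. leaves 200 & 63 = 8 bytes for the tail
	 * code.
	 */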
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
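	/*
	 * The loads may be hoisted well ahead of the adds: a 1's
	 * complement sum does not depend on the order in which the
	 * doublewords are accumulated, so only the carry chain through
	 * XER[CA] serializes the adde instructions.
	 */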
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)
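
/*
 * The fold at .Lcsum_finish (and .Lcopy_finish below) is equivalent to
 * this illustrative C, where rotl64() stands for a 64-bit rotate left:
 *
 *	u64 tmp = rotl64(r0, 32) + r0;
 *	return tmp >> 32;
 *
 * The rotate swaps the 32-bit halves, so the upper half of the 64-bit
 * add ends up holding low + high plus any carry out of the lower half,
 * i.e. the two halves are added with end-around carry.
 */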


	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm
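
/*
 * Each macro above attaches a numbered local label to the instruction
 * that follows it and records a (fault address, fixup address) pair in
 * the __ex_table section.  If a load or store tagged with one of these
 * macros takes an access fault, the exception handler branches to the
 * corresponding .Lsrc_error or .Ldest_error fixup (or its _nr variant)
 * below instead of treating the fault as fatal.  The "nr" variants are
 * used outside the unrolled loop, where r14-r16 have not been saved
 * and so must not be restored on the error path.
 */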

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
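/*
 * Illustrative C-level usage, a sketch only (handle_fault() is a
 * stand-in for whatever recovery the caller performs, as noted above):
 *
 *	int src_err = 0, dst_err = 0;
 *
 *	sum = csum_partial_copy_generic(src, dst, len, sum,
 *					&src_err, &dst_err);
 *	if (src_err || dst_err)
 *		handle_fault();		// e.g. zero dst, recompute sum
 *
 * Either error pointer may be NULL, in which case the corresponding
 * fault is simply not reported (see the cmpdi/beqlr pairs in the error
 * stubs at the end of this file).
 */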
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

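/*
 * Error fixups.  .Lsrc_error and .Ldest_error are reached from faults
 * taken inside the unrolled loop, so they restore r14-r16 and pop the
 * stack frame before falling through to the common code.  The _nr
 * entry points are reached from the alignment and tail code, where no
 * frame was allocated.
 */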
.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
EXPORT_SYMBOL(csum_partial_copy_generic)
