/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
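/*
 * Loosely, this computes the 32-bit partial Internet checksum of the
 * buffer added to "sum".  A rough C model of the semantics (illustrative
 * only, big-endian view, not part of the build, names made up for the
 * sketch; the routine below actually sums 64 bits at a time and folds
 * at the end, which agrees with this model once the result is folded
 * down to 16 bits):
 *
 *	static unsigned int csum_model(const unsigned char *buff,
 *				       unsigned long len, unsigned int sum)
 *	{
 *		unsigned long acc = sum;
 *		unsigned long i;
 *
 *		// add up the buffer as 16-bit big-endian words
 *		for (i = 0; i + 1 < len; i += 2)
 *			acc += (buff[i] << 8) | buff[i + 1];
 *		if (len & 1)
 *			acc += buff[len - 1] << 8;	// pad last byte to 16 bits
 *
 *		// fold back into 32 bits, wrapping carries around
 *		while (acc >> 32)
 *			acc = (acc & 0xffffffffUL) + (acc >> 32);
 *		return acc;
 *	}
 */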
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
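	/*
	 * For example, with buff % 8 == 2 the rldicl below gives 1, so the
	 * alignment loop does 4 - 1 = 3 halfword loads (6 bytes) before the
	 * doubleword code takes over.
	 */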
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6
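	/*
	 * The four loads just below plus the exit limb after the loop handle
	 * one full 64-byte chunk between them, hence the "- 1" above.
	 */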

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
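	/*
	 * After the final carry is added in below, r0 holds the full 64-bit
	 * sum.  Rotating it by 32 and adding it to itself leaves the
	 * end-around-carry fold of its two halves in the upper 32 bits,
	 * which the srdi then extracts as the 32-bit result.
	 */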
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr


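/*
 * Every load from src and every store to dst in csum_partial_copy_generic
 * below is tagged with one of these four macros, which emit an __ex_table
 * entry pointing at the matching fixup.  The "source"/"dest" forms are
 * used within the unrolled path, where a stack frame and r14-r16 are live
 * and must be restored; the "srcnr"/"dstnr" forms are used where no frame
 * is active, so their fixups skip the restore.
 */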
	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
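/*
 * A rough C model of the contract (illustrative only; csum_model refers
 * to the sketch above __csum_partial, and the real prototype lives in the
 * powerpc checksum.h header):
 *
 *	static unsigned int csum_copy_model(const unsigned char *src,
 *					    unsigned char *dst,
 *					    unsigned long len, unsigned int sum,
 *					    int *src_err, int *dst_err)
 *	{
 *		// the real routine copies and sums in one pass, with every
 *		// access covered by an __ex_table entry; on a fault it
 *		// stores -EFAULT through src_err or dst_err (if non-NULL)
 *		// and returns without a usable checksum
 *		memcpy(dst, src, len);		// needs <string.h> for memcpy
 *		return csum_model(src, len, sum);
 *	}
 */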
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are misaligned relative to each
	 * other, we only align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
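	/* Same 64- to 32-bit end-around-carry fold as at the end of __csum_partial. */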
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

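/*
 * Exception fixups.  The plain entry points unwind the stack frame and
 * restore r14-r16 saved by the unrolled loop; the _nr variants are used
 * where no frame is active.  -EFAULT is reported through the caller's
 * src_err/dst_err pointer when it is non-NULL.
 */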
.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
