/* xref: /openbmc/linux/arch/powerpc/lib/checksum_64.S (revision 6a143a7c) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 *
 * Register use:
 *   r0		64-bit running accumulator. Carries are chained through
 *		XER[CA] by adde and folded back in at .Lcsum_finish.
 *   r3		current read pointer
 *   r4		bytes still to be summed
 *   r6,r7,r9-r12	scratch
 *   r14-r16	extra load targets for the unrolled loop; saved and
 *		restored on a temporary stack frame around that loop
 *   ctr	loop counter
 *
 * Only addic/adde/addze write XER[CA] in this routine; the record-form
 * shifts/masks and the addi/subi/sub pointer arithmetic between them
 * leave CA untouched, so the carry chain survives across the whole body.
 *
 * Returns the sum folded to 32 bits in r3.
 */
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* r0 = sum, clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6		/* halfwords needed: 4 - r6 */
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1			/* last 64-byte block is the exit limb */
	mtctr	r6

	/* r14-r16 are used as load targets below: save them on a frame. */
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/* Entry limb: preload the first 32 bytes of the first block. */
	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)		/* preload first half of next block */
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	/* Exit limb: finish the block whose first half is already loaded. */
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63		/* bytes left after the 64-byte blocks */

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0		/* upper word = hi + lo + carry */
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)


/*
 * Fault-annotation macros for csum_partial_copy_generic.
 *
 * Each macro is written immediately before a load or store
 * ("srcnr; lhz ..."): it drops a numeric local label at that instruction
 * and passes the label plus a fixup target to EX_TABLE.
 *
 *   srcnr / dstnr	source load / destination store -> .Lerror_nr
 *   source / dest	source load / destination store -> .Lerror
 *
 * .Lerror first restores r14-r16 and pops the temporary stack frame, so
 * "source"/"dest" must only annotate accesses made while that frame is
 * live; the "nr" (no restore) variants are for every other access.
 */
	.macro srcnr
100:
	EX_TABLE(100b,.Lerror_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lerror)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Lerror_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Lerror)
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 *
 * Register use:
 *   r0		64-bit running accumulator, seeded with -1. Carries are
 *		chained through XER[CA] by adde and folded in at
 *		.Lcopy_finish.
 *   r3 / r4	current source / destination pointer
 *   r5		bytes still to be processed
 *   r6,r9-r12	scratch
 *   r14-r16	extra load targets for the unrolled loop; saved and
 *		restored on a temporary stack frame around that loop
 *   ctr	loop counter
 *
 * Every load and store of caller memory is tagged with
 * srcnr/dstnr (no frame live, fixup at .Lerror_nr) or source/dest
 * (frame live, fixup at .Lerror, which unwinds the frame first).
 *
 * Returns the sum folded to 32 bits in r3, or 0 after a fault.
 */
_GLOBAL(csum_partial_copy_generic)
	li	r6,-1
	addic	r0,r6,0			/* r0 = -1, clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6		/* halfwords needed: 4 - r6 */
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1			/* last 64-byte block is the exit limb */
	mtctr	r6

	/*
	 * r14-r16 are used as load targets below: save them on a frame.
	 * From here until the frame is popped, accesses use source/dest.
	 */
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/* Entry limb: preload the first 32 bytes of the first block. */
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)		/* preload first half of next block */
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	/* Exit limb: finish the block whose first half is already loaded. */
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	/* Frame is popped here: later accesses use srcnr/dstnr again. */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63		/* bytes left after the 64-byte blocks */

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0		/* upper word = hi + lo + carry */
	srdi	r3,r3,32
	blr

	/* Fault while the temporary frame was live: unwind it first. */
.Lerror:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
	/* Fault with no frame to unwind (also the tail of .Lerror). */
.Lerror_nr:
	li	r3,0			/* report the fault as a 0 result */
	blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 *
 * As used below: r3 = saddr, r4 = daddr, r5 = len, r6 = proto, r7 = sum.
 *
 * Sums the two 16-byte addresses (as four doublewords) together with
 * len + proto and sum in a 64-bit end-around-carry chain, folds the
 * result 64 -> 32 -> 16 bits, and returns the complement of that
 * 16-bit value in r3.
 */

_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)
	ld	r9, 8(r3)
	add	r5, r5, r6		/* r5 = len + proto */
	addc	r0, r8, r9		/* start the carry chain */
	ld	r10, 0(r4)
	ld	r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	/*
	 * NOTE(review): presumably moves len + proto up one byte so it
	 * matches the byte order of the address words as loaded on LE.
	 */
	rotldi	r5, r5, 8
#endif
	adde	r0, r0, r10
	add	r5, r5, r7		/* r5 += sum (plain add, CA untouched) */
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0			/* add in final carry */
	rotldi  r3, r0, 32		/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32
	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
	add	r3, r0, r3
	not	r3, r3
	rlwinm	r3, r3, 16, 16, 31	/* keep bits 31..16 as the 16-bit result */
	blr
EXPORT_SYMBOL(csum_ipv6_magic)
444