xref: /openbmc/linux/arch/powerpc/lib/checksum_64.S (revision 2f0f2441b4a10948e2ec042b48fef13680387f7c)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * This file contains assembly-language implementations
4 * of IP-style 1's complement checksum routines.
5 *
6 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
7 *
8 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
9 */
10
11#include <linux/sys.h>
12#include <asm/processor.h>
13#include <asm/errno.h>
14#include <asm/ppc_asm.h>
15#include <asm/export.h>
16
17/*
18 * Computes the checksum of a memory block at buff, length len,
19 * and adds in "sum" (32-bit).
20 *
21 * __csum_partial(r3=buff, r4=len, r5=sum)
22 */
23_GLOBAL(__csum_partial)
	/*
	 * Register roles in this routine:
	 *   r3  = read pointer into buff, advanced as data is consumed
	 *   r4  = number of bytes still to be summed
	 *   r0  = 64-bit running sum; every data word is added with adde so
	 *         the carry out of one add is fed into the next one
	 *   r6  = shift result / loop count, and loaded data
	 *   r9-r12, r14-r16 = loaded data (unrolled section only)
	 *   r7  = scratch
	 * The sum is folded to 32 bits and returned in r3.
	 *
	 * The carry chain depends on XER[CA] surviving from one adde to the
	 * next: the instructions placed between them (srdi., andi., addi,
	 * subi, sub, li, ld/lwz/lhz/lbz, std, mtctr, bdnz) do not write CA.
	 */
24	addic	r0,r5,0			/* r0 = sum; adding 0 leaves carry clear */
25
26	srdi.	r6,r4,3			/* less than 8 bytes? */
27	beq	.Lcsum_tail_word
28
29	/*
30	 * If only halfword aligned, align to a double word. Since odd
31	 * aligned addresses should be rare and they would require more
32	 * work to calculate the correct checksum, we ignore that case
33	 * and take the potential slowdown of unaligned loads.
34	 */
35	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
36	beq	.Lcsum_aligned
37
	/*
	 * ctr = 4 - r6 halfwords, i.e. 2, 4 or 6 bytes. len >= 8 was checked
	 * above, so the loop below cannot consume more than is available.
	 */
38	li	r7,4
39	sub	r6,r7,r6
40	mtctr	r6
41
421:
43	lhz	r6,0(r3)		/* align to doubleword */
44	subi	r4,r4,2
45	addi	r3,r3,2
46	adde	r0,r0,r6
47	bdnz	1b
48
49.Lcsum_aligned:
50	/*
51	 * We unroll the loop such that each iteration is 64 bytes with an
52	 * entry and exit limb of 64 bytes, meaning a minimum size of
53	 * 128 bytes.
54	 */
55	srdi.	r6,r4,7
56	beq	.Lcsum_tail_doublewords		/* len < 128 */
57
	/*
	 * ctr = (len / 64) - 1 passes through loop 2; the last 64-byte
	 * block is summed by the exit limb that follows the loop.
	 */
58	srdi	r6,r4,6
59	subi	r6,r6,1
60	mtctr	r6
61
	/*
	 * r14-r16 are used as extra data registers below (they are
	 * callee-saved in the 64-bit PowerPC ELF ABI), so open a stack
	 * frame and save them; they are reloaded after the exit limb.
	 */
62	stdu	r1,-STACKFRAMESIZE(r1)
63	std	r14,STK_REG(R14)(r1)
64	std	r15,STK_REG(R15)(r1)
65	std	r16,STK_REG(R16)(r1)
66
	/* Entry limb: preload the first 32 bytes of the first block. */
67	ld	r6,0(r3)
68	ld	r9,8(r3)
69
70	ld	r10,16(r3)
71	ld	r11,24(r3)
72
73	/*
74	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
75	 * because of the XER dependency. This means the fastest this loop can
76	 * go is 16 cycles per iteration. The scheduling of the loop below has
77	 * been shown to hit this on both POWER6 and POWER7.
78	 */
	/*
	 * Each pass sums one 64-byte block: bytes 0-31 are already in
	 * r6/r9/r10/r11, bytes 32-63 are loaded into r12/r14/r15/r16, and
	 * bytes 0-31 of the following block are preloaded before the branch.
	 */
79	.align 5
802:
81	adde	r0,r0,r6
82	ld	r12,32(r3)
83	ld	r14,40(r3)
84
85	adde	r0,r0,r9
86	ld	r15,48(r3)
87	ld	r16,56(r3)
88	addi	r3,r3,64		/* r3 now points at the next block */
89
90	adde	r0,r0,r10
91
92	adde	r0,r0,r11
93
94	adde	r0,r0,r12
95
96	adde	r0,r0,r14
97
98	adde	r0,r0,r15
99	ld	r6,0(r3)		/* preload next block, bytes 0-15 */
100	ld	r9,8(r3)
101
102	adde	r0,r0,r16
103	ld	r10,16(r3)		/* preload next block, bytes 16-31 */
104	ld	r11,24(r3)
105	bdnz	2b
106
107
	/*
	 * Exit limb: same work as one loop pass for the final 64-byte
	 * block, but nothing further is preloaded.
	 */
108	adde	r0,r0,r6
109	ld	r12,32(r3)
110	ld	r14,40(r3)
111
112	adde	r0,r0,r9
113	ld	r15,48(r3)
114	ld	r16,56(r3)
115	addi	r3,r3,64
116
117	adde	r0,r0,r10
118	adde	r0,r0,r11
119	adde	r0,r0,r12
120	adde	r0,r0,r14
121	adde	r0,r0,r15
122	adde	r0,r0,r16
123
	/* Restore the saved registers and drop the stack frame. */
124	ld	r14,STK_REG(R14)(r1)
125	ld	r15,STK_REG(R15)(r1)
126	ld	r16,STK_REG(R16)(r1)
127	addi	r1,r1,STACKFRAMESIZE
128
129	andi.	r4,r4,63		/* bytes left after the 64-byte blocks */
130
131.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
132	srdi.	r6,r4,3			/* r6 = whole doublewords left */
133	beq	.Lcsum_tail_word
134
135	mtctr	r6
1363:
137	ld	r6,0(r3)
138	addi	r3,r3,8
139	adde	r0,r0,r6
140	bdnz	3b
141
142	andi.	r4,r4,7			/* bytes left after the doublewords */
143
144.Lcsum_tail_word:			/* Up to 7 bytes to go */
145	srdi.	r6,r4,2			/* at least 4 bytes left? */
146	beq	.Lcsum_tail_halfword
147
148	lwz	r6,0(r3)
149	addi	r3,r3,4
150	adde	r0,r0,r6
151	subi	r4,r4,4
152
153.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
154	srdi.	r6,r4,1			/* at least 2 bytes left? */
155	beq	.Lcsum_tail_byte
156
157	lhz	r6,0(r3)
158	addi	r3,r3,2
159	adde	r0,r0,r6
160	subi	r4,r4,2
161
162.Lcsum_tail_byte:			/* Up to 1 byte to go */
163	andi.	r6,r4,1
164	beq	.Lcsum_finish
165
166	lbz	r6,0(r3)
167#ifdef __BIG_ENDIAN__
168	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
169	adde	r0,r0,r9
170#else
	/* Little endian: the byte is added unshifted, as the low byte. */
171	adde	r0,r0,r6
172#endif
173
174.Lcsum_finish:
175	addze	r0,r0			/* add in final carry */
176	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	/*
	 * r4 is r0 with its halves swapped, so the upper 32 bits of r4 + r0
	 * are high + low plus the carry out of the lower-half addition;
	 * shift them down to form the 32-bit return value.
	 */
177	add	r3,r4,r0
178	srdi	r3,r3,32
179	blr
180EXPORT_SYMBOL(__csum_partial)
181
182
	/*
	 * Fault-annotation macros used by csum_partial_copy_generic.
	 *
	 * Each macro drops a numeric local label at the current location,
	 * i.e. on the instruction written after it on the same line
	 * ("source; ld r6,0(r3)"), and passes that label together with a
	 * fixup label to EX_TABLE. EX_TABLE is defined outside this file;
	 * NOTE(review): presumably it records an exception-table entry that
	 * redirects a fault at the labelled instruction to the fixup label -
	 * confirm against its definition.
	 *
	 *   srcnr  -> .Lsrc_error_nr	(access to the source buffer)
	 *   source -> .Lsrc_error	(access to the source buffer)
	 *   dstnr  -> .Ldest_error_nr	(access to the destination buffer)
	 *   dest   -> .Ldest_error	(access to the destination buffer)
	 *
	 * The plain forms target the fixups that first reload r14-r16 and
	 * pop the stack frame; the "nr" forms target the entry points that
	 * skip that step.
	 */
183	.macro srcnr
184100:
185	EX_TABLE(100b,.Lsrc_error_nr)
186	.endm
187
188	.macro source
189150:
190	EX_TABLE(150b,.Lsrc_error)
191	.endm
192
193	.macro dstnr
194200:
195	EX_TABLE(200b,.Ldest_error_nr)
196	.endm
197
198	.macro dest
199250:
200	EX_TABLE(250b,.Ldest_error)
201	.endm
202
203/*
204 * Computes the checksum of a memory block at src, length len,
205 * and adds in "sum" (32-bit), while copying the block to dst.
206 * If an access exception occurs on src or dst, it stores -EFAULT
207 * to *src_err or *dst_err respectively. The caller must take any action
208 * required in this case (zeroing memory, recalculating partial checksum etc).
209 *
210 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
211 */
212_GLOBAL(csum_partial_copy_generic)
	/*
	 * Register roles in this routine:
	 *   r3  = read pointer into src, r4 = write pointer into dst
	 *   r5  = number of bytes still to be processed
	 *   r0  = 64-bit running sum, accumulated with adde so the carry
	 *         out of one add is fed into the next one
	 *   r6  = shift result / loop count, and data in flight
	 *   r9  = scratch, and data in flight
	 *   r10-r12, r14-r16 = data in flight (unrolled section only)
	 *   r7  = src_err, r8 = dst_err; only read by the fixup code at the
	 *         end of the routine
	 * On the normal path the sum is folded to 32 bits and returned in r3.
	 *
	 * Every load from src and store to dst is prefixed with one of the
	 * source/srcnr/dest/dstnr macros. The plain forms are used between
	 * the stdu that opens the stack frame and the addi that pops it;
	 * the "nr" forms are used everywhere else.
	 *
	 * The instructions placed between the adde instructions (srdi.,
	 * andi., addi, subi, sub, li, loads, stores, mtctr, bdnz) do not
	 * write XER[CA], so the carry chain is unbroken.
	 */
213	addic	r0,r6,0			/* r0 = sum; adding 0 leaves carry clear */
214
215	srdi.	r6,r5,3			/* less than 8 bytes? */
216	beq	.Lcopy_tail_word
217
218	/*
219	 * If only halfword aligned, align to a double word. Since odd
220	 * aligned addresses should be rare and they would require more
221	 * work to calculate the correct checksum, we ignore that case
222	 * and take the potential slowdown of unaligned loads.
223	 *
224	 * If the source and destination are relatively unaligned we only
225	 * align the source. This keeps things simple.
226	 */
227	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
228	beq	.Lcopy_aligned
229
	/*
	 * ctr = 4 - r6 halfwords, i.e. 2, 4 or 6 bytes. len >= 8 was checked
	 * above. r9 is the scratch register here because r7 and r8 hold the
	 * error pointers.
	 */
230	li	r9,4
231	sub	r6,r9,r6
232	mtctr	r6
233
2341:
235srcnr;	lhz	r6,0(r3)		/* align to doubleword */
236	subi	r5,r5,2
237	addi	r3,r3,2
238	adde	r0,r0,r6
239dstnr;	sth	r6,0(r4)
240	addi	r4,r4,2
241	bdnz	1b
242
243.Lcopy_aligned:
244	/*
245	 * We unroll the loop such that each iteration is 64 bytes with an
246	 * entry and exit limb of 64 bytes, meaning a minimum size of
247	 * 128 bytes.
248	 */
249	srdi.	r6,r5,7
250	beq	.Lcopy_tail_doublewords		/* len < 128 */
251
	/*
	 * ctr = (len / 64) - 1 passes through loop 2; the last 64-byte
	 * block is handled by the exit limb that follows the loop.
	 */
252	srdi	r6,r5,6
253	subi	r6,r6,1
254	mtctr	r6
255
	/*
	 * r14-r16 are used as extra data registers below (they are
	 * callee-saved in the 64-bit PowerPC ELF ABI), so open a stack
	 * frame and save them. From here until the frame is popped, the
	 * fixups .Lsrc_error/.Ldest_error are the ones that undo this.
	 */
256	stdu	r1,-STACKFRAMESIZE(r1)
257	std	r14,STK_REG(R14)(r1)
258	std	r15,STK_REG(R15)(r1)
259	std	r16,STK_REG(R16)(r1)
260
	/* Entry limb: preload the first 32 bytes of the first block. */
261source;	ld	r6,0(r3)
262source;	ld	r9,8(r3)
263
264source;	ld	r10,16(r3)
265source;	ld	r11,24(r3)
266
267	/*
268	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
269	 * because of the XER dependency. This means the fastest this loop can
270	 * go is 16 cycles per iteration. The scheduling of the loop below has
271	 * been shown to hit this on both POWER6 and POWER7.
272	 */
	/*
	 * Each pass handles one 64-byte block: bytes 0-31 are already in
	 * r6/r9/r10/r11, bytes 32-63 are loaded into r12/r14/r15/r16, all
	 * eight doublewords are summed and stored to dst, and bytes 0-31 of
	 * the following block are preloaded before the branch.
	 */
273	.align 5
2742:
275	adde	r0,r0,r6
276source;	ld	r12,32(r3)
277source;	ld	r14,40(r3)
278
279	adde	r0,r0,r9
280source;	ld	r15,48(r3)
281source;	ld	r16,56(r3)
282	addi	r3,r3,64		/* r3 now points at the next source block */
283
284	adde	r0,r0,r10
285dest;	std	r6,0(r4)
286dest;	std	r9,8(r4)
287
288	adde	r0,r0,r11
289dest;	std	r10,16(r4)
290dest;	std	r11,24(r4)
291
292	adde	r0,r0,r12
293dest;	std	r12,32(r4)
294dest;	std	r14,40(r4)
295
296	adde	r0,r0,r14
297dest;	std	r15,48(r4)
298dest;	std	r16,56(r4)
299	addi	r4,r4,64		/* all 64 bytes stored; advance dst */
300
301	adde	r0,r0,r15
302source;	ld	r6,0(r3)		/* preload next block, bytes 0-15 */
303source;	ld	r9,8(r3)
304
305	adde	r0,r0,r16
306source;	ld	r10,16(r3)		/* preload next block, bytes 16-31 */
307source;	ld	r11,24(r3)
308	bdnz	2b
309
310
	/*
	 * Exit limb: same work as one loop pass for the final 64-byte
	 * block, but nothing further is preloaded.
	 */
311	adde	r0,r0,r6
312source;	ld	r12,32(r3)
313source;	ld	r14,40(r3)
314
315	adde	r0,r0,r9
316source;	ld	r15,48(r3)
317source;	ld	r16,56(r3)
318	addi	r3,r3,64
319
320	adde	r0,r0,r10
321dest;	std	r6,0(r4)
322dest;	std	r9,8(r4)
323
324	adde	r0,r0,r11
325dest;	std	r10,16(r4)
326dest;	std	r11,24(r4)
327
328	adde	r0,r0,r12
329dest;	std	r12,32(r4)
330dest;	std	r14,40(r4)
331
332	adde	r0,r0,r14
333dest;	std	r15,48(r4)
334dest;	std	r16,56(r4)
335	addi	r4,r4,64
336
337	adde	r0,r0,r15
338	adde	r0,r0,r16
339
	/* Restore the saved registers and drop the stack frame. */
340	ld	r14,STK_REG(R14)(r1)
341	ld	r15,STK_REG(R15)(r1)
342	ld	r16,STK_REG(R16)(r1)
343	addi	r1,r1,STACKFRAMESIZE
344
345	andi.	r5,r5,63		/* bytes left after the 64-byte blocks */
346
347.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
348	srdi.	r6,r5,3			/* r6 = whole doublewords left */
349	beq	.Lcopy_tail_word
350
351	mtctr	r6
3523:
353srcnr;	ld	r6,0(r3)
354	addi	r3,r3,8
355	adde	r0,r0,r6
356dstnr;	std	r6,0(r4)
357	addi	r4,r4,8
358	bdnz	3b
359
360	andi.	r5,r5,7			/* bytes left after the doublewords */
361
362.Lcopy_tail_word:			/* Up to 7 bytes to go */
363	srdi.	r6,r5,2			/* at least 4 bytes left? */
364	beq	.Lcopy_tail_halfword
365
366srcnr;	lwz	r6,0(r3)
367	addi	r3,r3,4
368	adde	r0,r0,r6
369dstnr;	stw	r6,0(r4)
370	addi	r4,r4,4
371	subi	r5,r5,4
372
373.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
374	srdi.	r6,r5,1			/* at least 2 bytes left? */
375	beq	.Lcopy_tail_byte
376
377srcnr;	lhz	r6,0(r3)
378	addi	r3,r3,2
379	adde	r0,r0,r6
380dstnr;	sth	r6,0(r4)
381	addi	r4,r4,2
382	subi	r5,r5,2
383
384.Lcopy_tail_byte:			/* Up to 1 byte to go */
385	andi.	r6,r5,1
386	beq	.Lcopy_finish
387
388srcnr;	lbz	r6,0(r3)
389#ifdef __BIG_ENDIAN__
390	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
391	adde	r0,r0,r9
392#else
	/* Little endian: the byte is added unshifted, as the low byte. */
393	adde	r0,r0,r6
394#endif
395dstnr;	stb	r6,0(r4)		/* the unshifted byte is what gets copied */
396
397.Lcopy_finish:
398	addze	r0,r0			/* add in final carry */
399	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	/*
	 * r4 is r0 with its halves swapped, so the upper 32 bits of r4 + r0
	 * are high + low plus the carry out of the lower-half addition;
	 * shift them down to form the 32-bit return value.
	 */
400	add	r3,r4,r0
401	srdi	r3,r3,32
402	blr
403
	/*
	 * Fixup code named in the EX_TABLE entries emitted by the
	 * source/srcnr/dest/dstnr macros. None of these paths writes r3, so
	 * no checksum is produced when they are taken; the only effect is
	 * the optional -EFAULT store described below.
	 */

	/*
	 * Source access faulted while the stack frame was live: reload
	 * r14-r16, pop the frame, then fall through to the common handling.
	 */
404.Lsrc_error:
405	ld	r14,STK_REG(R14)(r1)
406	ld	r15,STK_REG(R15)(r1)
407	ld	r16,STK_REG(R16)(r1)
408	addi	r1,r1,STACKFRAMESIZE
409.Lsrc_error_nr:
410	cmpdi	0,r7,0
411	beqlr				/* src_err == NULL: just return */
412	li	r6,-EFAULT
413	stw	r6,0(r7)		/* *src_err = -EFAULT (32-bit store) */
414	blr
415
	/*
	 * Destination access faulted while the stack frame was live: reload
	 * r14-r16, pop the frame, then fall through to the common handling.
	 */
416.Ldest_error:
417	ld	r14,STK_REG(R14)(r1)
418	ld	r15,STK_REG(R15)(r1)
419	ld	r16,STK_REG(R16)(r1)
420	addi	r1,r1,STACKFRAMESIZE
421.Ldest_error_nr:
422	cmpdi	0,r8,0
423	beqlr				/* dst_err == NULL: just return */
424	li	r6,-EFAULT
425	stw	r6,0(r8)		/* *dst_err = -EFAULT (32-bit store) */
426	blr
427EXPORT_SYMBOL(csum_partial_copy_generic)
428
429/*
430 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
431 *			   const struct in6_addr *daddr,
432 *			   __u32 len, __u8 proto, __wsum sum)
433 */
434
435_GLOBAL(csum_ipv6_magic)
	/*
	 * Arguments arrive in r3-r7 in prototype order:
	 *   r3 = saddr, r4 = daddr, r5 = len, r6 = proto, r7 = sum
	 * Two doublewords (16 bytes) are read from each address.
	 *
	 * r0 accumulates a 64-bit sum with an addc/adde carry chain. The
	 * add, ld and rotldi instructions interleaved with that chain do
	 * not write XER[CA], so the carry is passed along unbroken.
	 * The complemented, 16-bit folded result is returned in r3.
	 */
436	ld	r8, 0(r3)		/* r8/r9 = the 16 bytes at saddr */
437	ld	r9, 8(r3)
438	add	r5, r5, r6		/* r5 = len + proto */
439	addc	r0, r8, r9		/* start of the carry chain */
440	ld	r10, 0(r4)		/* r10/r11 = the 16 bytes at daddr */
441	ld	r11, 8(r4)
442#ifdef CONFIG_CPU_LITTLE_ENDIAN
	/*
	 * Rotate len + proto left by one byte. NOTE(review): presumably this
	 * places the host-order value in the byte lanes that match the
	 * little-endian loads of the addresses above (modulo 0xffff a rotate
	 * by 8 acts like a byte swap of each 16-bit word) - confirm.
	 */
443	rotldi	r5, r5, 8
444#endif
445	adde	r0, r0, r10
446	add	r5, r5, r7		/* add the incoming sum, after the rotate */
447	adde	r0, r0, r11
448	adde	r0, r0, r5
449	addze	r0, r0			/* add in final carry */
450	rotldi  r3, r0, 32		/* fold two 32 bit halves together */
451	add	r3, r0, r3
452	srdi	r0, r3, 32		/* r0 = 32-bit folded sum */
453	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
454	add	r3, r0, r3		/* folded sum is in bits 16-31 of the low word */
455	not	r3, r3			/* one's complement of the sum */
456	rlwinm	r3, r3, 16, 16, 31	/* move that halfword to the low 16 bits, clear the rest */
457	blr
458EXPORT_SYMBOL(csum_ipv6_magic)
459