/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/errno.h>
#include <linux/linkage.h>
#include <variant/core.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */
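
/*
 * Typical use (a sketch, not taken from this file): a caller accumulates
 * a 32-bit partial sum and then folds it to the final 16-bit Internet
 * checksum, assuming the kernel's generic csum_fold() helper:
 *
 *	unsigned int sum = csum_partial(buf, len, 0);
 *	unsigned short check = csum_fold(sum);
 */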

/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;
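
/*
 * If the add wraps, the truncated result is below val, so bgeu falls
 * through and the lost carry is added back in (end-around carry).
 * Worked example: 0xffffffff + 0x00000002 wraps to 0x00000001, which is
 * below 0x00000002, so ONES_ADD yields 0x00000002 -- the one's-complement
 * sum.  The numeric label 99 may be redefined at every expansion because
 * gas local labels are referenced relatively (99f/99b).
 */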

.text
ENTRY(csum_partial)
	  /*
	   * Experiments with Ethernet and SLIP connections show that buf
	   * is aligned on either a 2-byte or 4-byte boundary.
	   */
	entry	sp, 32
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if not 4-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
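	/*
	 * Loop idiom used throughout this file: on cores with the Xtensa
	 * loop option (XCHAL_HAVE_LOOPS), loopgtz sets up a zero-overhead
	 * hardware loop that runs the body a5 times; otherwise the #else
	 * path computes the end address by hand and branches back with blt.
	 */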
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop2
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4
	retw

	/* uncommon case, buf is 2-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if odd address */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0..7  */
#ifdef	__XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
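	/*
	 * The three partial loads above reassemble one misaligned word.
	 * Example on little endian, with bytes b0 b1 b2 b3 at a2: a7 ends
	 * up as b3<<24 | b2<<16 | b1<<8 | b0, exactly what an aligned l32i
	 * of those bytes would return.  (The bits-NN comments above give
	 * the big-endian placement.)
	 */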
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop3
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef	__XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */

ENDPROC(csum_partial)

/*
 * Copy from src to dst while checksumming, otherwise like csum_partial.
 *
 * The macros SRC and DST specify the type of access for the instruction,
 * so we can call a custom exception handler for each access type.
 */

#define SRC(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6001f	;	\
	.previous

#define DST(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6002f	;	\
	.previous
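
/*
 * A note on the fixup mechanism (the kernel's __ex_table in brief): each
 * entry pairs the address of an instruction that may fault (9999b) with
 * the address of its fixup code (6001f for source accesses, 6002f for
 * destination accesses).  If the access faults, the trap handler looks
 * up the faulting PC in __ex_table and resumes execution at the fixup
 * instead of oopsing.
 */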

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
					int sum, int *src_err_ptr, int *dst_err_ptr)
	a2  = src
	a3  = dst
	a4  = len
	a5  = sum
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a8  = temp
	a9  = temp
	a10 = temp
	a11 = original len for exception handling
	a12 = original dst for exception handling

    This function is optimized for 4-byte aligned addresses.  Other
    alignments work, but not nearly as efficiently.
 */
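
/*
 * Caller's view (a sketch, not taken from this file): the error cells
 * are typically initialized to zero and inspected after the call:
 *
 *	int src_err = 0, dst_err = 0;
 *	sum = csum_partial_copy_generic(src, dst, len, sum,
 *					&src_err, &dst_err);
 *
 * If an access faulted, the fixup code below stored -EFAULT through the
 * corresponding pointer; otherwise both cells are still zero.
 */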

ENTRY(csum_partial_copy_generic)

	entry	sp, 32
	mov	a12, a3
	mov	a11, a4
	or	a10, a2, a3

	/* We optimize the following alignment tests for the 4-byte
	aligned case.  Two bbsi.l instructions might seem more optimal
	(commented out below).  However, both labels 5: and 3: are out
	of the imm8 range, so the assembler relaxes them into
	equivalent bbci.l, j combinations, which is actually
	slower. */

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
SRC(	l32i	a9, a2, 0	)
SRC(	l32i	a8, a2, 4	)
DST(	s32i	a9, a3, 0	)
DST(	s32i	a8, a3, 4	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 8	)
SRC(	l32i	a8, a2, 12	)
DST(	s32i	a9, a3, 8	)
DST(	s32i	a8, a3, 12	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 16	)
SRC(	l32i	a8, a2, 20	)
DST(	s32i	a9, a3, 16	)
DST(	s32i	a8, a3, 20	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 24	)
SRC(	l32i	a8, a2, 28	)
DST(	s32i	a9, a3, 24	)
DST(	s32i	a8, a3, 28	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
SRC(	l32i	a9, a2, 0	)
DST(	s32i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop6
#endif
3:
	/*
	Control arrives here in two cases: (1) it falls through from
	the 4-byte alignment case above to process, at most, one
	2-byte chunk; (2) it branches here from above when either src
	or dst is 2-byte aligned, and we process all bytes here except
	perhaps a trailing odd byte.  This path is inefficient, so
	align your addresses to 4-byte boundaries.

	a2 = src
	a3 = dst
	a4 = len
	a5 = sum
	*/
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
SRC(	l16ui	a9, a2, 0	)
DST(	s16i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop7
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
SRC(	l8ui	a9, a2, 0	)
DST(	s8i	a9, a3, 0	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5
	retw

5:
	/* Control branches here when either src or dst is odd.  We
	process all bytes using 8-bit accesses.  Grossly inefficient,
	so don't feed us an odd address. */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(	l8ui	a9, a2, 0	)
SRC(	l8ui	a8, a2, 1	)
DST(	s8i	a9, a3, 0	)
DST(	s8i	a8, a3, 1	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop8
#endif
6:
	j	4b		/* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)


# Exception handler:
.section .fixup, "ax"
/*
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a11 = original len for exception handling
	a12 = original dst for exception handling
*/

6001:
	_movi	a2, -EFAULT
	s32i	a2, a6, 0	/* src_err_ptr */

	# clear the complete destination - computing the rest
	# is too much work
	movi	a2, 0
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	beqz	a11, 2f
	add	a11, a11, a12	/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	blt	a12, a11, .Leloop
#endif
2:
	retw

6002:
	movi	a2, -EFAULT
	s32i	a2, a7, 0	/* dst_err_ptr */
	movi	a2, 0
	retw

.previous
