xref: /openbmc/linux/arch/xtensa/lib/checksum.S (revision c819e2cf)
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *                  Optimized by Joe Taylor
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
16
17#include <asm/errno.h>
18#include <linux/linkage.h>
19#include <variant/core.h>
20
/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function is optimized for a 2- or 4-byte aligned buf.  An odd buf
 * address is also handled, but only through a slow byte-assembling path.
 */
34
/*
 * ONES_ADD(sum, val): sum = sum + val in ones-complement arithmetic.
 *
 * The add itself is an ordinary twos-complement add.  If it wrapped, the
 * result is unsigned-less-than val, and the lost carry is folded back in
 * by adding 1.  sum and val must be different registers, because val is
 * compared against sum after sum has been updated.  Uses local label 99.
 */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;
41
.text

/*
 * csum_partial: add len (a3) bytes starting at buf (a2) to the running
 * 32-bit ones-complement sum (a4) and return the new sum in a2.
 *
 * Scratch registers: a5 (alignment bits, chunk count, or end address in
 * the non-loop build) and a6-a8 (loaded data).
 */
ENTRY(csum_partial)

	/*
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	entry	sp, 32
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if buf is not 4-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop1	/* addresses compare unsigned */
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop2	/* addresses compare unsigned */
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4		/* return the accumulated sum */
	retw

	/* uncommon case, buf is not 4-byte aligned */
8:
	/* len 0 and len 1 are finished here for any alignment */
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if 1-byte aligned */

	/* buf is 2-byte aligned: consume one halfword to reach alignment */
	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 *
	 * Each 32-bit word is assembled from a byte, a halfword and a
	 * byte so that it equals what an aligned l32i would have loaded.
	 */
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* byte 0:     bits 24..31 (EB),  0.. 7 (EL) */
	l16ui	a7, a2, 1	/* bytes 1..2: bits  8..23 */
	l8ui	a8, a2, 3	/* byte 3:     bits  0.. 7 (EB), 24..31 (EL) */
#ifdef	__XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop3	/* addresses compare unsigned */
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef	__XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */

ENDPROC(csum_partial)
175
/*
 * Copy from src to dst while checksumming, otherwise like csum_partial
 *
 * The macros SRC and DST specify the type of access for the instruction.
 * thus we can call a custom exception handler for each access type.
 *
 * Each macro emits the instruction y at local label 9999 and adds an
 * __ex_table entry pairing that address with a fixup label:
 *    SRC(y) -> 6001f
 *    DST(y) -> 6002f
 */

#define SRC(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6001f	;	\
	.previous

#define DST(y...)			\
	9999: y;			\
	.section __ex_table, "a";	\
	.long 9999b, 6002f	;	\
	.previous
194
/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
					int sum, int *src_err_ptr, int *dst_err_ptr)
	a2  = src
	a3  = dst
	a4  = len
	a5  = sum
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a8  = temp
	a9  = temp
	a10 = temp
	a11 = original len for exception handling
	a12 = original dst for exception handling

    Copies len bytes from src to dst, adding them to the ones-complement
    sum as it goes, and returns the new sum in a2.

    Every load is wrapped in SRC() and every store in DST().  A faulting
    load is redirected to fixup label 6001 and a faulting store to 6002;
    both store -EFAULT through the matching error pointer and return 0.

    This function is optimized for 4-byte aligned addresses.  Other
    alignments work, but not nearly as efficiently.
 */

ENTRY(csum_partial_copy_generic)

	entry	sp, 32
	mov	a12, a3		/* keep original dst for the fixup code */
	mov	a11, a4		/* keep original len for the fixup code */
	or	a10, a2, a3	/* low bits set if either address is unaligned */

	/* We optimize the following alignment tests for the 4-byte
	aligned case.  Two bbsi.l instructions might seem more optimal
	(commented out below).  However, both labels 5: and 3: are out
	of the imm8 range, so the assembler relaxes them into
	equivalent bbci.l, j combinations, which is actually
	slower. */

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
SRC(	l32i	a9, a2, 0	)
SRC(	l32i	a8, a2, 4	)
DST(	s32i	a9, a3, 0	)
DST(	s32i	a8, a3, 4	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 8	)
SRC(	l32i	a8, a2, 12	)
DST(	s32i	a9, a3, 8	)
DST(	s32i	a8, a3, 12	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 16	)
SRC(	l32i	a8, a2, 20	)
DST(	s32i	a9, a3, 16	)
DST(	s32i	a8, a3, 20	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 24	)
SRC(	l32i	a8, a2, 28	)
DST(	s32i	a9, a3, 24	)
DST(	s32i	a8, a3, 28	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop5	/* addresses compare unsigned */
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
SRC(	l32i	a9, a2, 0	)
DST(	s32i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop6	/* addresses compare unsigned */
#endif
3:
	/*
	Control comes to here in two cases: (1) It may fall through
	to here from the 4-byte alignment case to process, at most,
	one 2-byte chunk.  (2) It branches to here from above if
	either src or dst is 2-byte aligned, and we process all bytes
	here, except for perhaps a trailing odd byte.  It's
	inefficient, so align your addresses to 4-byte boundaries.

	a2 = src
	a3 = dst
	a4 = len
	a5 = sum
	*/
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
SRC(	l16ui	a9, a2, 0	)
DST(	s16i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop7	/* addresses compare unsigned */
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
SRC(	l8ui	a9, a2, 0	)
DST(	s8i	a9, a3, 0	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5		/* return the accumulated sum */
	retw

5:
	/* Control branch to here when either src or dst is odd.  We
	process all bytes using 8-bit accesses.  Grossly inefficient,
	so don't feed us an odd address. */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(	l8ui	a9, a2, 0	)
SRC(	l8ui	a8, a2, 1	)
DST(	s8i	a9, a3, 0	)
DST(	s8i	a8, a3, 1	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop8	/* addresses compare unsigned */
#endif
6:
	j	4b		/* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)
373
374
/*
 * Exception handler:
 *
 * 6001 is the __ex_table target for SRC() accesses and 6002 the target
 * for DST() accesses.  Both rely on these registers still holding the
 * values csum_partial_copy_generic set up, and both end in retw with
 * a2 = 0.
 *
 *	a6  = src_err_ptr
 *	a7  = dst_err_ptr
 *	a11 = original len for exception handling
 *	a12 = original dst for exception handling
 */
.section .fixup, "ax"

6001:
	_movi	a2, -EFAULT
	s32i	a2, a6, 0	/* src_err_ptr */

	/*
	 * clear the complete destination - computing the rest
	 * is too much work
	 */
	movi	a2, 0		/* fill byte; also the value returned */
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	beqz	a11, 2f
	add	a11, a11, a12	/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	bltu	a12, a11, .Leloop	/* addresses compare unsigned */
#endif
2:
	retw

6002:
	movi	a2, -EFAULT
	s32i	a2, a7, 0	/* dst_err_ptr */
	movi	a2, 0		/* return a zero sum */
	retw

.previous
413