xref: /openbmc/linux/arch/ia64/lib/ip_fast_csum.S (revision c51d39010a1bccc9c1294e2d7c00005aefeb2b5c)
1/*
2 * Optimized version of the ip_fast_csum() function
3 * Used for calculating IP header checksum
4 *
5 * Return: 16bit checksum, complemented
6 *
7 * Inputs:
8 *      in0: address of buffer to checksum (char *)
9 *      in1: length of the buffer in 32-bit words (int)
10 *
11 * Copyright (C) 2002, 2006 Intel Corp.
12 * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
13 */
14
15#include <asm/asmmacro.h>
16#include <asm/export.h>
17
18/*
19 * Since this function is most likely called with buf aligned on a 4-byte
20 * boundary and 20 bytes in length, we can execute that case rather quickly
21 * versus calling the generic do_csum, which has a lot of overhead in
22 * handling various alignments and sizes.  However, because no constraints
23 * are put on the function's input arguments, cases where buf is not 4-byte
24 * aligned or the size is not 20 bytes are handled by the generic do_csum.
25 */
26
/*
 * Register aliases used by the routines in this file.
 * in0-in4 name r32-r36 and carry the incoming arguments; ret0 names r8,
 * which ip_fast_csum loads with its result just before returning.
 * Keep comments off the #define lines themselves so that nothing but the
 * register name ends up in the macro expansion.
 */
27#define in0	r32
28#define in1	r33
29#define in2	r34
30#define in3	r35
31#define in4	r36
32#define ret0	r8
33
/*
 * ip_fast_csum: checksum of a buffer of in1 32-bit words starting at in0.
 *
 * Fast path (in1 == 5 and in0 4-byte aligned): the five words are loaded
 * with ld4, summed, folded to 16 bits and complemented, without allocating
 * a register frame.
 * Any other length or alignment branches to .generic, which calls do_csum
 * with the length converted to bytes (in1 << 2) and complements its result.
 *
 * Result is returned in ret0 (r8).
 * Registers written on the fast path: r8, r9, r14, r15, r20-r24, p6, p7 and
 * in0 (post-incremented by the loads).
 */
34GLOBAL_ENTRY(ip_fast_csum)
35	.prologue
36	.body			// empty prologue: the fast path saves nothing
37	cmp.ne	p6,p7=5,in1	// size other than 20 byte? p6 = (in1 != 5), p7 = (in1 == 5)
38	and	r14=3,in0	// is it aligned on 4-byte? r14 = low two address bits
39	add	r15=4,in0	// second source pointer
40	;;
41	cmp.ne.or.andcm p6,p7=r14,r0	// if misaligned: p6 = 1, p7 = 0; otherwise both unchanged
42	;;
43(p7)	ld4	r20=[in0],8	// word 0, then in0 += 8; skipped when going to .generic
44(p7)	ld4	r21=[r15],8	// word 1, then r15 += 8
45(p6)	br.spnt	.generic	// wrong size or alignment: let do_csum handle it
46	;;
47	ld4	r22=[in0],8	// word 2, then in0 += 8
48	ld4	r23=[r15],8	// word 3
49	;;
50	ld4	r24=[in0]	// word 4 (last word of the 20 bytes)
51	add	r20=r20,r21	// pairwise sums of the loaded words
52	add	r22=r22,r23
53	;;
54	add	r20=r20,r22
55	;;
56	add	r20=r20,r24	// r20 = sum of all five words
57	;;
58	shr.u	ret0=r20,16	// now need to add the carry: ret0 = bits above 15
59	zxt2	r20=r20		// r20 = low 16 bits
60	;;
61	add	r20=ret0,r20
62	;;
63	shr.u	ret0=r20,16	// add carry again
64	zxt2	r20=r20
65	;;
66	add	r20=ret0,r20
67	;;
68	shr.u	ret0=r20,16	// third fold absorbs the last possible carry
69	zxt2	r20=r20
70	;;
71	add	r20=ret0,r20
72	mov	r9=0xffff	// mask for the 16-bit result
73	;;
74	andcm	ret0=r9,r20	// ret0 = 0xffff & ~r20: 16-bit complement of the sum
75	.restore sp		// reset frame state
76	br.ret.sptk.many b0
77	;;
78
79.generic:
80	.prologue
81	.save ar.pfs, r35
82	alloc	r35=ar.pfs,2,2,2,0	// frame: 2 inputs, 2 locals, 2 outputs; old ar.pfs kept in r35
83	.save rp, r34
84	mov	r34=b0		// keep the return address across the call
85	.body
86	dep.z	out1=in1,2,30	// out1 = low 30 bits of in1 shifted left by 2: length in bytes
87	mov	out0=in0	// out0 = buffer address (in0 was not advanced on this path)
88	;;
89	br.call.sptk.many b0=do_csum
90	;;
91	andcm	ret0=-1,ret0	// ret0 = ~ret0: complement the value do_csum returned
92	mov	ar.pfs=r35	// restore the caller's frame state
93	mov	b0=r34		// restore the return address
94	br.ret.sptk.many b0
95END(ip_fast_csum)
96EXPORT_SYMBOL(ip_fast_csum)
97
/*
 * csum_ipv6_magic: complemented 16-bit checksum over two 16-byte buffers
 * plus three scalar arguments.
 *
 * Inputs (as used by the code below):
 *      in0, in1: addresses of two buffers; four 32-bit words are read from
 *                each (presumably the IPv6 source and destination
 *                addresses -- confirm against the C prototype)
 *      in2: 32-bit value, zero-extended here (presumably the payload
 *           length -- confirm)
 *      in3: only the low 16 bits are used (presumably the protocol /
 *           next-header value -- confirm)
 *      in4: 32-bit value, zero-extended here (presumably the incoming
 *           partial checksum -- confirm)
 *
 * Return: r8 = folded 16-bit sum, complemented.
 * Registers written: r8-r11, r15-r27, in0, in1, in2, in4.
 */
98GLOBAL_ENTRY(csum_ipv6_magic)
99	ld4	r20=[in0],4	// word 0 of the first buffer, then in0 += 4
100	ld4	r21=[in1],4	// word 0 of the second buffer, then in1 += 4
101	zxt4	in2=in2		// keep only the low 32 bits of in2
102	;;
103	ld4	r22=[in0],4	// word 1 of each buffer
104	ld4	r23=[in1],4
105	dep	r15=in3,in2,32,16	// r15 = in2 with in3[15:0] inserted at bits 32..47
106	;;
107	ld4	r24=[in0],4	// word 2 of each buffer
108	ld4	r25=[in1],4
109	mux1	r15=r15,@rev	// reverse the order of the eight bytes in r15
110	add	r16=r20,r21	// sum of the two word-0 values
111	add	r17=r22,r23	// sum of the two word-1 values
112	zxt4	in4=in4		// keep only the low 32 bits of in4
113	;;
114	ld4	r26=[in0],4	// word 3 of each buffer
115	ld4	r27=[in1],4
116	shr.u	r15=r15,16	// drop the two bytes that came from the zero bits 48..63;
				// r15 = byte-swapped in3[15:0] in bits 0..15,
				// byte-swapped in2 in bits 16..47
117	add	r18=r24,r25	// sum of the two word-2 values
118	add	r8=r16,r17	// r8 starts accumulating the buffer words
119	;;
120	add	r19=r26,r27	// sum of the two word-3 values
121	add	r8=r8,r18
122	;;
123	add	r8=r8,r19	// r8 = sum of all eight buffer words
124	add	r15=r15,in4	// add in4 to the byte-swapped in2/in3 term
125	;;
126	add	r8=r8,r15	// r8 = full 64-bit sum
127	;;
128	shr.u	r10=r8,32	// now fold sum into short: r10 = upper 32 bits
129	zxt4	r11=r8		// r11 = lower 32 bits
130	;;
131	add	r8=r10,r11
132	;;
133	shr.u	r10=r8,16	// yeah, keep it rolling: add bits above 15 to the low 16
134	zxt2	r11=r8
135	;;
136	add	r8=r10,r11
137	;;
138	shr.u	r10=r8,16	// three times lucky
139	zxt2	r11=r8
140	;;
141	add	r8=r10,r11
142	mov	r9=0xffff	// mask for the 16-bit result
143	;;
144	andcm	r8=r9,r8	// r8 = 0xffff & ~r8: 16-bit complement of the sum
145	br.ret.sptk.many b0
146END(csum_ipv6_magic)
147EXPORT_SYMBOL(csum_ipv6_magic)
148