1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *		IPv4 specific functions
11  *
12  *
13  *		code split from:
14  *		linux/ipv4/tcp.c
15  *		linux/ipv4/tcp_input.c
16  *		linux/ipv4/tcp_output.c
17  *
18  *		See tcp.c for author information
19  *
20  *	This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25 
26 /*
27  * Changes:
28  *		David S. Miller	:	New socket lookup architecture.
29  *					This code is dedicated to John Dyson.
30  *		David S. Miller :	Change semantics of established hash,
31  *					half is devoted to TIME_WAIT sockets
32  *					and the rest go in the other half.
33  *		Andi Kleen :		Add support for syncookies and fixed
34  *					some bugs: ip options weren't passed to
35  *					the TCP layer, missed a check for an
36  *					ACK bit.
37  *		Andi Kleen :		Implemented fast path mtu discovery.
38  *	     				Fixed many serious bugs in the
39  *					request_sock handling and moved
40  *					most of it into the af independent code.
41  *					Added tail drop and some other bugfixes.
42  *					Added new listen semantics.
43  *		Mike McLagan	:	Routing by source
44  *	Juan Jose Ciarlante:		ip_dynaddr bits
45  *		Andi Kleen:		various fixes.
46  *	Vitaly E. Lavrov	:	Transparent proxy revived after a
47  *					year-long coma.
48  *	Andi Kleen		:	Fix new listen.
49  *	Andi Kleen		:	Fix accept error reporting.
50  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
51  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
52  *					to a single port at the same time.
53  */
54 
55 #include <linux/config.h>
56 
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65 
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/xfrm.h>
73 
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82 
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85 
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88 
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90 		       struct sk_buff *skb);
91 
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 	.lhash_lock	= RW_LOCK_UNLOCKED,
94 	.lhash_users	= ATOMIC_INIT(0),
95 	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
97 	.port_rover	= 1024 - 1,
98 };
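/* tcp_hashinfo bundles the TCP lookup tables shared with the generic inet
 * code: the established hash (whose second half holds TIME-WAIT sockets),
 * the bound-port hash and the listening hash, plus their locks.
 */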
99 
100 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
101 {
102 	return inet_csk_get_port(&tcp_hashinfo, sk, snum);
103 }
104 
105 static void tcp_v4_hash(struct sock *sk)
106 {
107 	inet_hash(&tcp_hashinfo, sk);
108 }
109 
110 void tcp_unhash(struct sock *sk)
111 {
112 	inet_unhash(&tcp_hashinfo, sk);
113 }
114 
115 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
116 {
117 	return secure_tcp_sequence_number(skb->nh.iph->daddr,
118 					  skb->nh.iph->saddr,
119 					  skb->h.th->dest,
120 					  skb->h.th->source);
121 }
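/* The initial sequence number is derived from the connection 4-tuple,
 * a secret and a clock component, so it is hard for an outside observer
 * to predict.
 */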
122 
123 /* called with local bh disabled */
124 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
125 				      struct inet_timewait_sock **twp)
126 {
127 	struct inet_sock *inet = inet_sk(sk);
128 	u32 daddr = inet->rcv_saddr;
129 	u32 saddr = inet->daddr;
130 	int dif = sk->sk_bound_dev_if;
131 	INET_ADDR_COOKIE(acookie, saddr, daddr)
132 	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
133 	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
134 	struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
135 	struct sock *sk2;
136 	const struct hlist_node *node;
137 	struct inet_timewait_sock *tw;
138 
139 	prefetch(head->chain.first);
140 	write_lock(&head->lock);
141 
142 	/* Check TIME-WAIT sockets first. */
143 	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
144 		tw = inet_twsk(sk2);
145 
146 		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
147 			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
148 			struct tcp_sock *tp = tcp_sk(sk);
149 
150 			/* With PAWS, it is safe from the viewpoint
151 			   of data integrity. Even without PAWS it
152 			   is safe provided the sequence spaces do not
153 			   overlap, i.e. at data rates <= 80 Mbit/sec.
154 
155 			   Actually, the idea is close to VJ's, except
156 			   that the timestamp cache is held not per host
157 			   but per port pair, and the TW bucket is used
158 			   as the state holder.
159 
160 			   If the TW bucket has already been destroyed we
161 			   fall back to VJ's scheme and use the initial
162 			   timestamp retrieved from the peer table.
163 			 */
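			/* Note on the reuse below: write_seq for the new
			   connection is set to tw_snd_nxt + 65535 + 2,
			   i.e. comfortably above anything the previous
			   incarnation could still have in flight (a full
			   64K window plus two), so old duplicates cannot
			   be mistaken for new data.
			 */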
164 			if (tcptw->tw_ts_recent_stamp &&
165 			    (!twp || (sysctl_tcp_tw_reuse &&
166 				      xtime.tv_sec -
167 				      tcptw->tw_ts_recent_stamp > 1))) {
168 				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
169 				if (tp->write_seq == 0)
170 					tp->write_seq = 1;
171 				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
172 				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 				sock_hold(sk2);
174 				goto unique;
175 			} else
176 				goto not_unique;
177 		}
178 	}
179 	tw = NULL;
180 
181 	/* And established part... */
182 	sk_for_each(sk2, node, &head->chain) {
183 		if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
184 			goto not_unique;
185 	}
186 
187 unique:
188 	/* Must record num and sport now. Otherwise we will see a
189 	 * socket with a funny identity in the hash table. */
190 	inet->num = lport;
191 	inet->sport = htons(lport);
192 	sk->sk_hash = hash;
193 	BUG_TRAP(sk_unhashed(sk));
194 	__sk_add_node(sk, &head->chain);
195 	sock_prot_inc_use(sk->sk_prot);
196 	write_unlock(&head->lock);
197 
198 	if (twp) {
199 		*twp = tw;
200 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
201 	} else if (tw) {
202 		/* Silly. Should hash-dance instead... */
203 		inet_twsk_deschedule(tw, &tcp_death_row);
204 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
205 
206 		inet_twsk_put(tw);
207 	}
208 
209 	return 0;
210 
211 not_unique:
212 	write_unlock(&head->lock);
213 	return -EADDRNOTAVAIL;
214 }
215 
216 static inline u32 connect_port_offset(const struct sock *sk)
217 {
218 	const struct inet_sock *inet = inet_sk(sk);
219 
220 	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
221 					 inet->dport);
222 }
223 
224 /*
225  * Bind a port for a connect operation and hash it.
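 *
 * When no local port is bound yet, the ephemeral port is picked roughly as
 *
 *	port = low + (i + hint + offset) % range
 *
 * where "offset" comes from connect_port_offset() above, so the starting
 * point differs per destination while the static hint keeps successive
 * connects from rescanning the same ports.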
226  */
227 static inline int tcp_v4_hash_connect(struct sock *sk)
228 {
229 	const unsigned short snum = inet_sk(sk)->num;
230  	struct inet_bind_hashbucket *head;
231  	struct inet_bind_bucket *tb;
232 	int ret;
233 
234  	if (!snum) {
235  		int low = sysctl_local_port_range[0];
236  		int high = sysctl_local_port_range[1];
237 		int range = high - low;
238  		int i;
239 		int port;
240 		static u32 hint;
241 		u32 offset = hint + connect_port_offset(sk);
242 		struct hlist_node *node;
243  		struct inet_timewait_sock *tw = NULL;
244 
245  		local_bh_disable();
246 		for (i = 1; i <= range; i++) {
247 			port = low + (i + offset) % range;
248  			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
249  			spin_lock(&head->lock);
250 
251  			/* Does not bother with rcv_saddr checks,
252  			 * because the established check is already
253  			 * unique enough.
254  			 */
255 			inet_bind_bucket_for_each(tb, node, &head->chain) {
256  				if (tb->port == port) {
257  					BUG_TRAP(!hlist_empty(&tb->owners));
258  					if (tb->fastreuse >= 0)
259  						goto next_port;
260  					if (!__tcp_v4_check_established(sk,
261 									port,
262 									&tw))
263  						goto ok;
264  					goto next_port;
265  				}
266  			}
267 
268  			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
269  			if (!tb) {
270  				spin_unlock(&head->lock);
271  				break;
272  			}
273  			tb->fastreuse = -1;
274  			goto ok;
275 
276  		next_port:
277  			spin_unlock(&head->lock);
278  		}
279  		local_bh_enable();
280 
281  		return -EADDRNOTAVAIL;
282 
283 ok:
284 		hint += i;
285 
286  		/* Head lock still held and bh's disabled */
287  		inet_bind_hash(sk, tb, port);
288 		if (sk_unhashed(sk)) {
289  			inet_sk(sk)->sport = htons(port);
290  			__inet_hash(&tcp_hashinfo, sk, 0);
291  		}
292  		spin_unlock(&head->lock);
293 
294  		if (tw) {
295  		inet_twsk_deschedule(tw, &tcp_death_row);
296  			inet_twsk_put(tw);
297  		}
298 
299 		ret = 0;
300 		goto out;
301  	}
302 
303  	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
304  	tb  = inet_csk(sk)->icsk_bind_hash;
305 	spin_lock_bh(&head->lock);
306 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
307 		__inet_hash(&tcp_hashinfo, sk, 0);
308 		spin_unlock_bh(&head->lock);
309 		return 0;
310 	} else {
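		/* Plain spin_unlock() on purpose: local BHs must stay
		 * disabled across __tcp_v4_check_established() below and
		 * are only re-enabled at "out:".
		 */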
311 		spin_unlock(&head->lock);
312 		/* No definite answer... Walk to established hash table */
313 		ret = __tcp_v4_check_established(sk, snum, NULL);
314 out:
315 		local_bh_enable();
316 		return ret;
317 	}
318 }
319 
320 /* This will initiate an outgoing connection. */
321 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
322 {
323 	struct inet_sock *inet = inet_sk(sk);
324 	struct tcp_sock *tp = tcp_sk(sk);
325 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
326 	struct rtable *rt;
327 	u32 daddr, nexthop;
328 	int tmp;
329 	int err;
330 
331 	if (addr_len < sizeof(struct sockaddr_in))
332 		return -EINVAL;
333 
334 	if (usin->sin_family != AF_INET)
335 		return -EAFNOSUPPORT;
336 
337 	nexthop = daddr = usin->sin_addr.s_addr;
338 	if (inet->opt && inet->opt->srr) {
339 		if (!daddr)
340 			return -EINVAL;
341 		nexthop = inet->opt->faddr;
342 	}
343 
344 	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
345 			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
346 			       IPPROTO_TCP,
347 			       inet->sport, usin->sin_port, sk);
348 	if (tmp < 0)
349 		return tmp;
350 
351 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
352 		ip_rt_put(rt);
353 		return -ENETUNREACH;
354 	}
355 
356 	if (!inet->opt || !inet->opt->srr)
357 		daddr = rt->rt_dst;
358 
359 	if (!inet->saddr)
360 		inet->saddr = rt->rt_src;
361 	inet->rcv_saddr = inet->saddr;
362 
363 	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
364 		/* Reset inherited state */
365 		tp->rx_opt.ts_recent	   = 0;
366 		tp->rx_opt.ts_recent_stamp = 0;
367 		tp->write_seq		   = 0;
368 	}
369 
370 	if (tcp_death_row.sysctl_tw_recycle &&
371 	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
372 		struct inet_peer *peer = rt_get_peer(rt);
373 
374 		/* VJ's idea. We save last timestamp seen from
375 		 * the destination in peer table, when entering state TIME-WAIT
376 		 * and initialize rx_opt.ts_recent from it, when trying new connection.
377 		 */
378 
379 		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
380 			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
381 			tp->rx_opt.ts_recent = peer->tcp_ts;
382 		}
383 	}
384 
385 	inet->dport = usin->sin_port;
386 	inet->daddr = daddr;
387 
388 	tp->ext_header_len = 0;
389 	if (inet->opt)
390 		tp->ext_header_len = inet->opt->optlen;
391 
392 	tp->rx_opt.mss_clamp = 536;
393 
394 	/* Socket identity is still unknown (sport may be zero).
395 	 * However we set state to SYN-SENT and, without releasing the socket
396 	 * lock, select a source port, enter ourselves into the hash tables and
397 	 * complete initialization after this.
398 	 */
399 	tcp_set_state(sk, TCP_SYN_SENT);
400 	err = tcp_v4_hash_connect(sk);
401 	if (err)
402 		goto failure;
403 
404 	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
405 	if (err)
406 		goto failure;
407 
408 	/* OK, now commit destination to socket.  */
409 	sk_setup_caps(sk, &rt->u.dst);
410 
411 	if (!tp->write_seq)
412 		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
413 							   inet->daddr,
414 							   inet->sport,
415 							   usin->sin_port);
416 
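	/* Seed the IP ID counter from the sequence number so it does not
	 * start at a predictable value.
	 */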
417 	inet->id = tp->write_seq ^ jiffies;
418 
419 	err = tcp_connect(sk);
420 	rt = NULL;
421 	if (err)
422 		goto failure;
423 
424 	return 0;
425 
426 failure:
427 	/* This unhashes the socket and releases the local port, if necessary. */
428 	tcp_set_state(sk, TCP_CLOSE);
429 	ip_rt_put(rt);
430 	sk->sk_route_caps = 0;
431 	inet->dport = 0;
432 	return err;
433 }
434 
435 /*
436  * This routine does path mtu discovery as defined in RFC1191.
437  */
438 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
439 				     u32 mtu)
440 {
441 	struct dst_entry *dst;
442 	struct inet_sock *inet = inet_sk(sk);
443 	struct tcp_sock *tp = tcp_sk(sk);
444 
445 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
446 	 * sent out by Linux are always < 576 bytes so they should go through
447 	 * unfragmented).
448 	 */
449 	if (sk->sk_state == TCP_LISTEN)
450 		return;
451 
452 	/* We don't check in the dst entry if pmtu discovery is forbidden
453 	 * on this route. We just assume that no packet-too-big packets
454 	 * are sent back when pmtu discovery is not active.
455 	 * There is a small race when the user changes this flag in the
456 	 * route, but I think that's acceptable.
457 	 */
458 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
459 		return;
460 
461 	dst->ops->update_pmtu(dst, mtu);
462 
463 	/* Something is about to go wrong... Remember the soft error
464 	 * in case this connection is not able to recover.
465 	 */
466 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
467 		sk->sk_err_soft = EMSGSIZE;
468 
469 	mtu = dst_mtu(dst);
470 
471 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
472 	    tp->pmtu_cookie > mtu) {
473 		tcp_sync_mss(sk, mtu);
474 
475 		/* Resend the TCP packet because it's
476 		 * clear that the old packet has been
477 		 * dropped. This is the new "fast" path mtu
478 		 * discovery.
479 		 */
480 		tcp_simple_retransmit(sk);
481 	} /* else let the usual retransmit timer handle it */
482 }
483 
484 /*
485  * This routine is called by the ICMP module when it gets some
486  * sort of error condition.  If err < 0 then the socket should
487  * be closed and the error returned to the user.  If err > 0
488  * it's just the icmp type << 8 | icmp code.  After adjustment,
489  * the header points to the first 8 bytes of the tcp header.  We need
490  * to find the appropriate port.
491  *
492  * The locking strategy used here is very "optimistic". When
493  * someone else accesses the socket the ICMP is just dropped
494  * and for some paths there is no check at all.
495  * A more general error queue to queue errors for later handling
496  * is probably better.
497  *
498  */
499 
500 void tcp_v4_err(struct sk_buff *skb, u32 info)
501 {
502 	struct iphdr *iph = (struct iphdr *)skb->data;
503 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
504 	struct tcp_sock *tp;
505 	struct inet_sock *inet;
506 	int type = skb->h.icmph->type;
507 	int code = skb->h.icmph->code;
508 	struct sock *sk;
509 	__u32 seq;
510 	int err;
511 
512 	if (skb->len < (iph->ihl << 2) + 8) {
513 		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
514 		return;
515 	}
516 
517 	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
518 			 th->source, inet_iif(skb));
519 	if (!sk) {
520 		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
521 		return;
522 	}
523 	if (sk->sk_state == TCP_TIME_WAIT) {
524 		inet_twsk_put((struct inet_timewait_sock *)sk);
525 		return;
526 	}
527 
528 	bh_lock_sock(sk);
529 	/* If too many ICMPs get dropped on busy
530 	 * servers this needs to be solved differently.
531 	 */
532 	if (sock_owned_by_user(sk))
533 		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
534 
535 	if (sk->sk_state == TCP_CLOSE)
536 		goto out;
537 
538 	tp = tcp_sk(sk);
539 	seq = ntohl(th->seq);
540 	if (sk->sk_state != TCP_LISTEN &&
541 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
542 		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
543 		goto out;
544 	}
545 
546 	switch (type) {
547 	case ICMP_SOURCE_QUENCH:
548 		/* Just silently ignore these. */
549 		goto out;
550 	case ICMP_PARAMETERPROB:
551 		err = EPROTO;
552 		break;
553 	case ICMP_DEST_UNREACH:
554 		if (code > NR_ICMP_UNREACH)
555 			goto out;
556 
557 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
558 			if (!sock_owned_by_user(sk))
559 				do_pmtu_discovery(sk, iph, info);
560 			goto out;
561 		}
562 
563 		err = icmp_err_convert[code].errno;
564 		break;
565 	case ICMP_TIME_EXCEEDED:
566 		err = EHOSTUNREACH;
567 		break;
568 	default:
569 		goto out;
570 	}
571 
572 	switch (sk->sk_state) {
573 		struct request_sock *req, **prev;
574 	case TCP_LISTEN:
575 		if (sock_owned_by_user(sk))
576 			goto out;
577 
578 		req = inet_csk_search_req(sk, &prev, th->dest,
579 					  iph->daddr, iph->saddr);
580 		if (!req)
581 			goto out;
582 
583 		/* ICMPs are not backlogged, hence we cannot get
584 		   an established socket here.
585 		 */
586 		BUG_TRAP(!req->sk);
587 
588 		if (seq != tcp_rsk(req)->snt_isn) {
589 			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
590 			goto out;
591 		}
592 
593 		/*
594 		 * Still in SYN_RECV, just remove it silently.
595 		 * There is no good way to pass the error to the newly
596 		 * created socket, and POSIX does not want network
597 		 * errors returned from accept().
598 		 */
599 		inet_csk_reqsk_queue_drop(sk, req, prev);
600 		goto out;
601 
602 	case TCP_SYN_SENT:
603 	case TCP_SYN_RECV:  /* Cannot happen?
604 			       It can, e.g. if SYNs crossed.
605 			     */
606 		if (!sock_owned_by_user(sk)) {
607 			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
608 			sk->sk_err = err;
609 
610 			sk->sk_error_report(sk);
611 
612 			tcp_done(sk);
613 		} else {
614 			sk->sk_err_soft = err;
615 		}
616 		goto out;
617 	}
618 
619 	/* If we've already connected we will keep trying
620 	 * until we time out, or the user gives up.
621 	 *
622 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
623 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
624 	 * but it is obsoleted by pmtu discovery).
625 	 *
626 	 * Note that in the modern internet, where routing is unreliable
627 	 * and broken firewalls sit in every dark corner sending random
628 	 * errors ordered by their masters, even these two messages have
629 	 * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
630 	 *
631 	 * Now we are in compliance with RFCs.
632 	 *							--ANK (980905)
633 	 */
634 
635 	inet = inet_sk(sk);
636 	if (!sock_owned_by_user(sk) && inet->recverr) {
637 		sk->sk_err = err;
638 		sk->sk_error_report(sk);
639 	} else	{ /* Only an error on timeout */
640 		sk->sk_err_soft = err;
641 	}
642 
643 out:
644 	bh_unlock_sock(sk);
645 	sock_put(sk);
646 }
647 
648 /* This routine computes an IPv4 TCP checksum. */
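/* With CHECKSUM_HW the hardware finishes the job: only the complemented
 * pseudo-header sum goes into th->check, and skb->csum records the offset
 * of the check field so the device knows where to fold in the payload sum.
 * Otherwise the whole checksum is computed in software with csum_partial().
 */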
649 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
650 		       struct sk_buff *skb)
651 {
652 	struct inet_sock *inet = inet_sk(sk);
653 
654 	if (skb->ip_summed == CHECKSUM_HW) {
655 		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
656 		skb->csum = offsetof(struct tcphdr, check);
657 	} else {
658 		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
659 					 csum_partial((char *)th,
660 						      th->doff << 2,
661 						      skb->csum));
662 	}
663 }
664 
665 /*
666  *	This routine will send an RST to the other tcp.
667  *
668  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
669  *		      for the reset?
670  *	Answer: if a packet caused the RST, it is not for a socket
671  *		existing in our system; if it is matched to a socket,
672  *		it is just a duplicate segment or a bug in the other side's TCP.
673  *		So we build the reply based only on the parameters that
674  *		arrived with the segment.
675  *	Exception: precedence violation. We do not implement it in any case.
676  */
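/*
 *	Sequence numbers in the reset follow RFC 793: if the offending
 *	segment carried an ACK, its ack_seq becomes the sequence number of
 *	the RST; otherwise the RST has sequence number zero and ACKs
 *	everything the segment occupied (payload length plus SYN/FIN).
 */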
677 
678 static void tcp_v4_send_reset(struct sk_buff *skb)
679 {
680 	struct tcphdr *th = skb->h.th;
681 	struct tcphdr rth;
682 	struct ip_reply_arg arg;
683 
684 	/* Never send a reset in response to a reset. */
685 	if (th->rst)
686 		return;
687 
688 	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
689 		return;
690 
691 	/* Swap the send and the receive. */
692 	memset(&rth, 0, sizeof(struct tcphdr));
693 	rth.dest   = th->source;
694 	rth.source = th->dest;
695 	rth.doff   = sizeof(struct tcphdr) / 4;
696 	rth.rst    = 1;
697 
698 	if (th->ack) {
699 		rth.seq = th->ack_seq;
700 	} else {
701 		rth.ack = 1;
702 		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
703 				    skb->len - (th->doff << 2));
704 	}
705 
706 	memset(&arg, 0, sizeof arg);
707 	arg.iov[0].iov_base = (unsigned char *)&rth;
708 	arg.iov[0].iov_len  = sizeof rth;
709 	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
710 				      skb->nh.iph->saddr, /*XXX*/
711 				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
712 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
713 
714 	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
715 
716 	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
717 	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
718 }
719 
720 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
721    outside of socket context, is certainly ugly. What can I do?
722  */
723 
724 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
725 			    u32 win, u32 ts)
726 {
727 	struct tcphdr *th = skb->h.th;
728 	struct {
729 		struct tcphdr th;
730 		u32 tsopt[3];
731 	} rep;
732 	struct ip_reply_arg arg;
733 
734 	memset(&rep.th, 0, sizeof(struct tcphdr));
735 	memset(&arg, 0, sizeof arg);
736 
737 	arg.iov[0].iov_base = (unsigned char *)&rep;
738 	arg.iov[0].iov_len  = sizeof(rep.th);
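	/* When echoing a peer timestamp, append a 12 byte option block:
	 * two NOPs for alignment, then TIMESTAMP (kind 8, length 10)
	 * carrying our tcp_time_stamp and the echoed value.
	 */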
739 	if (ts) {
740 		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
741 				     (TCPOPT_TIMESTAMP << 8) |
742 				     TCPOLEN_TIMESTAMP);
743 		rep.tsopt[1] = htonl(tcp_time_stamp);
744 		rep.tsopt[2] = htonl(ts);
745 		arg.iov[0].iov_len = sizeof(rep);
746 	}
747 
748 	/* Swap the send and the receive. */
749 	rep.th.dest    = th->source;
750 	rep.th.source  = th->dest;
751 	rep.th.doff    = arg.iov[0].iov_len / 4;
752 	rep.th.seq     = htonl(seq);
753 	rep.th.ack_seq = htonl(ack);
754 	rep.th.ack     = 1;
755 	rep.th.window  = htons(win);
756 
757 	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
758 				      skb->nh.iph->saddr, /*XXX*/
759 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
760 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
761 
762 	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
763 
764 	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
765 }
766 
767 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
768 {
769 	struct inet_timewait_sock *tw = inet_twsk(sk);
770 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
771 
772 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
773 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
774 
775 	inet_twsk_put(tw);
776 }
777 
778 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
779 {
780 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
781 			req->ts_recent);
782 }
783 
784 /*
785  *	Send a SYN-ACK after having received an ACK.
786  *	This still operates on a request_sock only, not on a big
787  *	socket.
788  */
789 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
790 			      struct dst_entry *dst)
791 {
792 	const struct inet_request_sock *ireq = inet_rsk(req);
793 	int err = -1;
794 	struct sk_buff * skb;
795 
796 	/* First, grab a route. */
797 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
798 		goto out;
799 
800 	skb = tcp_make_synack(sk, dst, req);
801 
802 	if (skb) {
803 		struct tcphdr *th = skb->h.th;
804 
805 		th->check = tcp_v4_check(th, skb->len,
806 					 ireq->loc_addr,
807 					 ireq->rmt_addr,
808 					 csum_partial((char *)th, skb->len,
809 						      skb->csum));
810 
811 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
812 					    ireq->rmt_addr,
813 					    ireq->opt);
814 		if (err == NET_XMIT_CN)
815 			err = 0;
816 	}
817 
818 out:
819 	dst_release(dst);
820 	return err;
821 }
822 
823 /*
824  *	IPv4 request_sock destructor.
825  */
826 static void tcp_v4_reqsk_destructor(struct request_sock *req)
827 {
828 	if (inet_rsk(req)->opt)
829 		kfree(inet_rsk(req)->opt);
830 }
831 
832 static inline void syn_flood_warning(struct sk_buff *skb)
833 {
834 	static unsigned long warntime;
835 
836 	if (time_after(jiffies, (warntime + HZ * 60))) {
837 		warntime = jiffies;
838 		printk(KERN_INFO
839 		       "possible SYN flooding on port %d. Sending cookies.\n",
840 		       ntohs(skb->h.th->dest));
841 	}
842 }
843 
844 /*
845  * Save and compile IPv4 options into the request_sock if needed.
846  */
847 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
848 						     struct sk_buff *skb)
849 {
850 	struct ip_options *opt = &(IPCB(skb)->opt);
851 	struct ip_options *dopt = NULL;
852 
853 	if (opt && opt->optlen) {
854 		int opt_size = optlength(opt);
855 		dopt = kmalloc(opt_size, GFP_ATOMIC);
856 		if (dopt) {
857 			if (ip_options_echo(dopt, skb)) {
858 				kfree(dopt);
859 				dopt = NULL;
860 			}
861 		}
862 	}
863 	return dopt;
864 }
865 
866 struct request_sock_ops tcp_request_sock_ops = {
867 	.family		=	PF_INET,
868 	.obj_size	=	sizeof(struct tcp_request_sock),
869 	.rtx_syn_ack	=	tcp_v4_send_synack,
870 	.send_ack	=	tcp_v4_reqsk_send_ack,
871 	.destructor	=	tcp_v4_reqsk_destructor,
872 	.send_reset	=	tcp_v4_send_reset,
873 };
874 
875 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
876 {
877 	struct inet_request_sock *ireq;
878 	struct tcp_options_received tmp_opt;
879 	struct request_sock *req;
880 	__u32 saddr = skb->nh.iph->saddr;
881 	__u32 daddr = skb->nh.iph->daddr;
882 	__u32 isn = TCP_SKB_CB(skb)->when;
883 	struct dst_entry *dst = NULL;
884 #ifdef CONFIG_SYN_COOKIES
885 	int want_cookie = 0;
886 #else
887 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
888 #endif
889 
890 	/* Never answer SYNs sent to broadcast or multicast */
891 	if (((struct rtable *)skb->dst)->rt_flags &
892 	    (RTCF_BROADCAST | RTCF_MULTICAST))
893 		goto drop;
894 
895 	/* TW buckets are converted to open requests without
896 	 * limitation; they conserve resources and the peer is
897 	 * evidently a real one.
898 	 */
899 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
900 #ifdef CONFIG_SYN_COOKIES
901 		if (sysctl_tcp_syncookies) {
902 			want_cookie = 1;
903 		} else
904 #endif
905 		goto drop;
906 	}
907 
908 	/* Accept backlog is full. If we have already queued enough
909 	 * warm entries in the syn queue, drop this request. It is better than
910 	 * clogging the syn queue with openreqs with exponentially increasing
911 	 * timeout.
912 	 */
913 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
914 		goto drop;
915 
916 	req = reqsk_alloc(&tcp_request_sock_ops);
917 	if (!req)
918 		goto drop;
919 
920 	tcp_clear_options(&tmp_opt);
921 	tmp_opt.mss_clamp = 536;
922 	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
923 
924 	tcp_parse_options(skb, &tmp_opt, 0);
925 
926 	if (want_cookie) {
927 		tcp_clear_options(&tmp_opt);
928 		tmp_opt.saw_tstamp = 0;
929 	}
930 
931 	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
932 		/* Some OSes (unknown ones, but I see them on a web server
933 		 * which contains information interesting only for Windows
934 		 * users) do not send their timestamp in the SYN. It is an easy
935 		 * case: we simply do not advertise TS support.
936 		 */
937 		tmp_opt.saw_tstamp = 0;
938 		tmp_opt.tstamp_ok  = 0;
939 	}
940 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
941 
942 	tcp_openreq_init(req, &tmp_opt, skb);
943 
944 	ireq = inet_rsk(req);
945 	ireq->loc_addr = daddr;
946 	ireq->rmt_addr = saddr;
947 	ireq->opt = tcp_v4_save_options(sk, skb);
948 	if (!want_cookie)
949 		TCP_ECN_create_request(req, skb->h.th);
950 
951 	if (want_cookie) {
952 #ifdef CONFIG_SYN_COOKIES
953 		syn_flood_warning(skb);
954 #endif
955 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
956 	} else if (!isn) {
957 		struct inet_peer *peer = NULL;
958 
959 		/* VJ's idea. We save the last timestamp seen
960 		 * from the destination in the peer table when entering
961 		 * TIME-WAIT state, and check against it before
962 		 * accepting a new connection request.
963 		 *
964 		 * If "isn" is not zero, this request hit an alive
965 		 * timewait bucket, so all the necessary checks
966 		 * are made by the function processing the timewait state.
967 		 */
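		/* The test below rejects the SYN when this peer was seen
		 * within TCP_PAWS_MSL and the timestamp it sends now is
		 * behind the one we last recorded by more than
		 * TCP_PAWS_WINDOW, i.e. it looks like an old duplicate
		 * rather than a genuinely new connection.
		 */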
968 		if (tmp_opt.saw_tstamp &&
969 		    tcp_death_row.sysctl_tw_recycle &&
970 		    (dst = inet_csk_route_req(sk, req)) != NULL &&
971 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
972 		    peer->v4daddr == saddr) {
973 			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
974 			    (s32)(peer->tcp_ts - req->ts_recent) >
975 							TCP_PAWS_WINDOW) {
976 				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
977 				dst_release(dst);
978 				goto drop_and_free;
979 			}
980 		}
981 		/* Kill the following clause if you dislike this approach. */
982 		else if (!sysctl_tcp_syncookies &&
983 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
984 			  (sysctl_max_syn_backlog >> 2)) &&
985 			 (!peer || !peer->tcp_ts_stamp) &&
986 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
987 			/* Without syncookies the last quarter of the
988 			 * backlog is filled only with destinations
989 			 * proven to be alive.
990 			 * It means that we continue to communicate only
991 			 * with destinations already remembered at
992 			 * the moment the synflood started.
993 			 */
994 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
995 				       "request from %u.%u.%u.%u/%u\n",
996 				       NIPQUAD(saddr),
997 				       ntohs(skb->h.th->source));
998 			dst_release(dst);
999 			goto drop_and_free;
1000 		}
1001 
1002 		isn = tcp_v4_init_sequence(sk, skb);
1003 	}
1004 	tcp_rsk(req)->snt_isn = isn;
1005 
1006 	if (tcp_v4_send_synack(sk, req, dst))
1007 		goto drop_and_free;
1008 
1009 	if (want_cookie) {
1010 	   	reqsk_free(req);
1011 	} else {
1012 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1013 	}
1014 	return 0;
1015 
1016 drop_and_free:
1017 	reqsk_free(req);
1018 drop:
1019 	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1020 	return 0;
1021 }
1022 
1023 
1024 /*
1025  * The three way handshake has completed - we got a valid synack -
1026  * now create the new socket.
1027  */
1028 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1029 				  struct request_sock *req,
1030 				  struct dst_entry *dst)
1031 {
1032 	struct inet_request_sock *ireq;
1033 	struct inet_sock *newinet;
1034 	struct tcp_sock *newtp;
1035 	struct sock *newsk;
1036 
1037 	if (sk_acceptq_is_full(sk))
1038 		goto exit_overflow;
1039 
1040 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1041 		goto exit;
1042 
1043 	newsk = tcp_create_openreq_child(sk, req, skb);
1044 	if (!newsk)
1045 		goto exit;
1046 
1047 	sk_setup_caps(newsk, dst);
1048 
1049 	newtp		      = tcp_sk(newsk);
1050 	newinet		      = inet_sk(newsk);
1051 	ireq		      = inet_rsk(req);
1052 	newinet->daddr	      = ireq->rmt_addr;
1053 	newinet->rcv_saddr    = ireq->loc_addr;
1054 	newinet->saddr	      = ireq->loc_addr;
1055 	newinet->opt	      = ireq->opt;
1056 	ireq->opt	      = NULL;
1057 	newinet->mc_index     = inet_iif(skb);
1058 	newinet->mc_ttl	      = skb->nh.iph->ttl;
1059 	newtp->ext_header_len = 0;
1060 	if (newinet->opt)
1061 		newtp->ext_header_len = newinet->opt->optlen;
1062 	newinet->id = newtp->write_seq ^ jiffies;
1063 
1064 	tcp_sync_mss(newsk, dst_mtu(dst));
1065 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1066 	tcp_initialize_rcv_mss(newsk);
1067 
1068 	__inet_hash(&tcp_hashinfo, newsk, 0);
1069 	__inet_inherit_port(&tcp_hashinfo, sk, newsk);
1070 
1071 	return newsk;
1072 
1073 exit_overflow:
1074 	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1075 exit:
1076 	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1077 	dst_release(dst);
1078 	return NULL;
1079 }
1080 
1081 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1082 {
1083 	struct tcphdr *th = skb->h.th;
1084 	struct iphdr *iph = skb->nh.iph;
1085 	struct sock *nsk;
1086 	struct request_sock **prev;
1087 	/* Find possible connection requests. */
1088 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1089 						       iph->saddr, iph->daddr);
1090 	if (req)
1091 		return tcp_check_req(sk, skb, req, prev);
1092 
1093 	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1094 					th->source, skb->nh.iph->daddr,
1095 					ntohs(th->dest), inet_iif(skb));
1096 
1097 	if (nsk) {
1098 		if (nsk->sk_state != TCP_TIME_WAIT) {
1099 			bh_lock_sock(nsk);
1100 			return nsk;
1101 		}
1102 		inet_twsk_put((struct inet_timewait_sock *)nsk);
1103 		return NULL;
1104 	}
1105 
1106 #ifdef CONFIG_SYN_COOKIES
1107 	if (!th->rst && !th->syn && th->ack)
1108 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1109 #endif
1110 	return sk;
1111 }
1112 
1113 static int tcp_v4_checksum_init(struct sk_buff *skb)
1114 {
1115 	if (skb->ip_summed == CHECKSUM_HW) {
1116 		skb->ip_summed = CHECKSUM_UNNECESSARY;
1117 		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1118 				  skb->nh.iph->daddr, skb->csum))
1119 			return 0;
1120 
1121 		LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
1122 		skb->ip_summed = CHECKSUM_NONE;
1123 	}
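	/* For short packets it is cheaper to verify the whole checksum
	 * right away; for longer ones only the pseudo-header sum is seeded
	 * into skb->csum and the remainder is verified later (possibly
	 * while copying to user space).
	 */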
1124 	if (skb->len <= 76) {
1125 		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1126 				 skb->nh.iph->daddr,
1127 				 skb_checksum(skb, 0, skb->len, 0)))
1128 			return -1;
1129 		skb->ip_summed = CHECKSUM_UNNECESSARY;
1130 	} else {
1131 		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1132 					  skb->nh.iph->saddr,
1133 					  skb->nh.iph->daddr, 0);
1134 	}
1135 	return 0;
1136 }
1137 
1138 
1139 /* The socket must have its spinlock held when we get
1140  * here.
1141  *
1142  * We have a potential double-lock case here, so even when
1143  * doing backlog processing we use the BH locking scheme.
1144  * This is because we cannot sleep with the original spinlock
1145  * held.
1146  */
1147 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1148 {
1149 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1150 		TCP_CHECK_TIMER(sk);
1151 		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1152 			goto reset;
1153 		TCP_CHECK_TIMER(sk);
1154 		return 0;
1155 	}
1156 
1157 	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1158 		goto csum_err;
1159 
1160 	if (sk->sk_state == TCP_LISTEN) {
1161 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1162 		if (!nsk)
1163 			goto discard;
1164 
1165 		if (nsk != sk) {
1166 			if (tcp_child_process(sk, nsk, skb))
1167 				goto reset;
1168 			return 0;
1169 		}
1170 	}
1171 
1172 	TCP_CHECK_TIMER(sk);
1173 	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1174 		goto reset;
1175 	TCP_CHECK_TIMER(sk);
1176 	return 0;
1177 
1178 reset:
1179 	tcp_v4_send_reset(skb);
1180 discard:
1181 	kfree_skb(skb);
1182 	/* Be careful here. If this function gets more complicated and
1183 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1184 	 * might be destroyed here. This current version compiles correctly,
1185 	 * but you have been warned.
1186 	 */
1187 	return 0;
1188 
1189 csum_err:
1190 	TCP_INC_STATS_BH(TCP_MIB_INERRS);
1191 	goto discard;
1192 }
1193 
1194 /*
1195  *	From tcp_input.c
1196  */
1197 
1198 int tcp_v4_rcv(struct sk_buff *skb)
1199 {
1200 	struct tcphdr *th;
1201 	struct sock *sk;
1202 	int ret;
1203 
1204 	if (skb->pkt_type != PACKET_HOST)
1205 		goto discard_it;
1206 
1207 	/* Count it even if it's bad */
1208 	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1209 
1210 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1211 		goto discard_it;
1212 
1213 	th = skb->h.th;
1214 
1215 	if (th->doff < sizeof(struct tcphdr) / 4)
1216 		goto bad_packet;
1217 	if (!pskb_may_pull(skb, th->doff * 4))
1218 		goto discard_it;
1219 
1220 	/* An explanation is required here, I think.
1221 	 * Packet length and doff are validated by header prediction,
1222 	 * provided the case of th->doff==0 is eliminated.
1223 	 * So, we defer the checks. */
1224 	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1225 	     tcp_v4_checksum_init(skb) < 0))
1226 		goto bad_packet;
1227 
1228 	th = skb->h.th;
1229 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1230 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1231 				    skb->len - th->doff * 4);
1232 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1233 	TCP_SKB_CB(skb)->when	 = 0;
1234 	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
1235 	TCP_SKB_CB(skb)->sacked	 = 0;
1236 
1237 	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1238 			   skb->nh.iph->daddr, ntohs(th->dest),
1239 			   inet_iif(skb));
1240 
1241 	if (!sk)
1242 		goto no_tcp_socket;
1243 
1244 process:
1245 	if (sk->sk_state == TCP_TIME_WAIT)
1246 		goto do_time_wait;
1247 
1248 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1249 		goto discard_and_relse;
1250 
1251 	if (sk_filter(sk, skb, 0))
1252 		goto discard_and_relse;
1253 
1254 	skb->dev = NULL;
1255 
1256 	bh_lock_sock(sk);
1257 	ret = 0;
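	/* If no user context owns the socket, try the prequeue (drained by
	 * the receiving task in tcp_recvmsg) and otherwise process the
	 * segment right here; if the socket is owned, defer to the backlog,
	 * which runs when the owner releases the socket.
	 */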
1258 	if (!sock_owned_by_user(sk)) {
1259 		if (!tcp_prequeue(sk, skb))
1260 			ret = tcp_v4_do_rcv(sk, skb);
1261 	} else
1262 		sk_add_backlog(sk, skb);
1263 	bh_unlock_sock(sk);
1264 
1265 	sock_put(sk);
1266 
1267 	return ret;
1268 
1269 no_tcp_socket:
1270 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1271 		goto discard_it;
1272 
1273 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1274 bad_packet:
1275 		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1276 	} else {
1277 		tcp_v4_send_reset(skb);
1278 	}
1279 
1280 discard_it:
1281 	/* Discard frame. */
1282 	kfree_skb(skb);
1283   	return 0;
1284 
1285 discard_and_relse:
1286 	sock_put(sk);
1287 	goto discard_it;
1288 
1289 do_time_wait:
1290 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1291 		inet_twsk_put((struct inet_timewait_sock *) sk);
1292 		goto discard_it;
1293 	}
1294 
1295 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1296 		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1297 		inet_twsk_put((struct inet_timewait_sock *) sk);
1298 		goto discard_it;
1299 	}
1300 	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1301 					   skb, th)) {
1302 	case TCP_TW_SYN: {
1303 		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1304 							skb->nh.iph->daddr,
1305 							ntohs(th->dest),
1306 							inet_iif(skb));
1307 		if (sk2) {
1308 			inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1309 					     &tcp_death_row);
1310 			inet_twsk_put((struct inet_timewait_sock *)sk);
1311 			sk = sk2;
1312 			goto process;
1313 		}
1314 		/* Fall through to ACK */
1315 	}
1316 	case TCP_TW_ACK:
1317 		tcp_v4_timewait_ack(sk, skb);
1318 		break;
1319 	case TCP_TW_RST:
1320 		goto no_tcp_socket;
1321 	case TCP_TW_SUCCESS:;
1322 	}
1323 	goto discard_it;
1324 }
1325 
1326 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1327 {
1328 	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1329 	struct inet_sock *inet = inet_sk(sk);
1330 
1331 	sin->sin_family		= AF_INET;
1332 	sin->sin_addr.s_addr	= inet->daddr;
1333 	sin->sin_port		= inet->dport;
1334 }
1335 
1336 /* VJ's idea. Save last timestamp seen from this destination
1337  * and hold it at least for normal timewait interval to use for duplicate
1338  * segment detection in subsequent connections, before they enter synchronized
1339  * state.
1340  */
1341 
1342 int tcp_v4_remember_stamp(struct sock *sk)
1343 {
1344 	struct inet_sock *inet = inet_sk(sk);
1345 	struct tcp_sock *tp = tcp_sk(sk);
1346 	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1347 	struct inet_peer *peer = NULL;
1348 	int release_it = 0;
1349 
1350 	if (!rt || rt->rt_dst != inet->daddr) {
1351 		peer = inet_getpeer(inet->daddr, 1);
1352 		release_it = 1;
1353 	} else {
1354 		if (!rt->peer)
1355 			rt_bind_peer(rt, 1);
1356 		peer = rt->peer;
1357 	}
1358 
1359 	if (peer) {
1360 		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1361 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1362 		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1363 			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1364 			peer->tcp_ts = tp->rx_opt.ts_recent;
1365 		}
1366 		if (release_it)
1367 			inet_putpeer(peer);
1368 		return 1;
1369 	}
1370 
1371 	return 0;
1372 }
1373 
1374 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1375 {
1376 	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1377 
1378 	if (peer) {
1379 		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1380 
1381 		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1382 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1383 		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1384 			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1385 			peer->tcp_ts	   = tcptw->tw_ts_recent;
1386 		}
1387 		inet_putpeer(peer);
1388 		return 1;
1389 	}
1390 
1391 	return 0;
1392 }
1393 
1394 struct tcp_func ipv4_specific = {
1395 	.queue_xmit	=	ip_queue_xmit,
1396 	.send_check	=	tcp_v4_send_check,
1397 	.rebuild_header	=	inet_sk_rebuild_header,
1398 	.conn_request	=	tcp_v4_conn_request,
1399 	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
1400 	.remember_stamp	=	tcp_v4_remember_stamp,
1401 	.net_header_len	=	sizeof(struct iphdr),
1402 	.setsockopt	=	ip_setsockopt,
1403 	.getsockopt	=	ip_getsockopt,
1404 	.addr2sockaddr	=	v4_addr2sockaddr,
1405 	.sockaddr_len	=	sizeof(struct sockaddr_in),
1406 };
1407 
1408 /* NOTE: A lot of things are set to zero explicitly by the call to
1409  *       sk_alloc(), so they need not be done here.
1410  */
1411 static int tcp_v4_init_sock(struct sock *sk)
1412 {
1413 	struct inet_connection_sock *icsk = inet_csk(sk);
1414 	struct tcp_sock *tp = tcp_sk(sk);
1415 
1416 	skb_queue_head_init(&tp->out_of_order_queue);
1417 	tcp_init_xmit_timers(sk);
1418 	tcp_prequeue_init(tp);
1419 
1420 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1421 	tp->mdev = TCP_TIMEOUT_INIT;
1422 
1423 	/* So many TCP implementations out there (incorrectly) count the
1424 	 * initial SYN frame in their delayed-ACK and congestion control
1425 	 * algorithms that we must have the following bandaid to talk
1426 	 * efficiently to them.  -DaveM
1427 	 */
1428 	tp->snd_cwnd = 2;
1429 
1430 	/* See draft-stevens-tcpca-spec-01 for discussion of the
1431 	 * initialization of these values.
1432 	 */
1433 	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1434 	tp->snd_cwnd_clamp = ~0;
1435 	tp->mss_cache = 536;
1436 
1437 	tp->reordering = sysctl_tcp_reordering;
1438 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1439 
1440 	sk->sk_state = TCP_CLOSE;
1441 
1442 	sk->sk_write_space = sk_stream_write_space;
1443 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1444 
1445 	tp->af_specific = &ipv4_specific;
1446 
1447 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1448 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1449 
1450 	atomic_inc(&tcp_sockets_allocated);
1451 
1452 	return 0;
1453 }
1454 
1455 int tcp_v4_destroy_sock(struct sock *sk)
1456 {
1457 	struct tcp_sock *tp = tcp_sk(sk);
1458 
1459 	tcp_clear_xmit_timers(sk);
1460 
1461 	tcp_cleanup_congestion_control(sk);
1462 
1463 	/* Clean up the write buffer. */
1464   	sk_stream_writequeue_purge(sk);
1465 
1466 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1467   	__skb_queue_purge(&tp->out_of_order_queue);
1468 
1469 	/* Clean the prequeue; it really must be empty. */
1470 	__skb_queue_purge(&tp->ucopy.prequeue);
1471 
1472 	/* Clean up a referenced TCP bind bucket. */
1473 	if (inet_csk(sk)->icsk_bind_hash)
1474 		inet_put_port(&tcp_hashinfo, sk);
1475 
1476 	/*
1477 	 * If sendmsg cached page exists, toss it.
1478 	 */
1479 	if (sk->sk_sndmsg_page) {
1480 		__free_page(sk->sk_sndmsg_page);
1481 		sk->sk_sndmsg_page = NULL;
1482 	}
1483 
1484 	atomic_dec(&tcp_sockets_allocated);
1485 
1486 	return 0;
1487 }
1488 
1489 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1490 
1491 #ifdef CONFIG_PROC_FS
1492 /* Proc filesystem TCP sock list dumping. */
1493 
1494 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1495 {
1496 	return hlist_empty(head) ? NULL :
1497 		list_entry(head->first, struct inet_timewait_sock, tw_node);
1498 }
1499 
1500 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1501 {
1502 	return tw->tw_node.next ?
1503 		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1504 }
1505 
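/* The /proc iterator walks the listening hash first (descending into each
 * listener's SYN queue as TCP_SEQ_STATE_OPENREQ), then the established
 * hash, and finally the TIME-WAIT half of each established bucket.
 */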
1506 static void *listening_get_next(struct seq_file *seq, void *cur)
1507 {
1508 	struct inet_connection_sock *icsk;
1509 	struct hlist_node *node;
1510 	struct sock *sk = cur;
1511 	struct tcp_iter_state* st = seq->private;
1512 
1513 	if (!sk) {
1514 		st->bucket = 0;
1515 		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1516 		goto get_sk;
1517 	}
1518 
1519 	++st->num;
1520 
1521 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1522 		struct request_sock *req = cur;
1523 
1524 	       	icsk = inet_csk(st->syn_wait_sk);
1525 		req = req->dl_next;
1526 		while (1) {
1527 			while (req) {
1528 				if (req->rsk_ops->family == st->family) {
1529 					cur = req;
1530 					goto out;
1531 				}
1532 				req = req->dl_next;
1533 			}
1534 			if (++st->sbucket >= TCP_SYNQ_HSIZE)
1535 				break;
1536 get_req:
1537 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1538 		}
1539 		sk	  = sk_next(st->syn_wait_sk);
1540 		st->state = TCP_SEQ_STATE_LISTENING;
1541 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1542 	} else {
1543 	       	icsk = inet_csk(sk);
1544 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1545 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1546 			goto start_req;
1547 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1548 		sk = sk_next(sk);
1549 	}
1550 get_sk:
1551 	sk_for_each_from(sk, node) {
1552 		if (sk->sk_family == st->family) {
1553 			cur = sk;
1554 			goto out;
1555 		}
1556 	       	icsk = inet_csk(sk);
1557 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1558 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1559 start_req:
1560 			st->uid		= sock_i_uid(sk);
1561 			st->syn_wait_sk = sk;
1562 			st->state	= TCP_SEQ_STATE_OPENREQ;
1563 			st->sbucket	= 0;
1564 			goto get_req;
1565 		}
1566 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1567 	}
1568 	if (++st->bucket < INET_LHTABLE_SIZE) {
1569 		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1570 		goto get_sk;
1571 	}
1572 	cur = NULL;
1573 out:
1574 	return cur;
1575 }
1576 
1577 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1578 {
1579 	void *rc = listening_get_next(seq, NULL);
1580 
1581 	while (rc && *pos) {
1582 		rc = listening_get_next(seq, rc);
1583 		--*pos;
1584 	}
1585 	return rc;
1586 }
1587 
1588 static void *established_get_first(struct seq_file *seq)
1589 {
1590 	struct tcp_iter_state* st = seq->private;
1591 	void *rc = NULL;
1592 
1593 	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1594 		struct sock *sk;
1595 		struct hlist_node *node;
1596 		struct inet_timewait_sock *tw;
1597 
1598 		/* We can reschedule _before_ having picked the target: */
1599 		cond_resched_softirq();
1600 
1601 		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1602 		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1603 			if (sk->sk_family != st->family) {
1604 				continue;
1605 			}
1606 			rc = sk;
1607 			goto out;
1608 		}
1609 		st->state = TCP_SEQ_STATE_TIME_WAIT;
1610 		inet_twsk_for_each(tw, node,
1611 				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1612 			if (tw->tw_family != st->family) {
1613 				continue;
1614 			}
1615 			rc = tw;
1616 			goto out;
1617 		}
1618 		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1619 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1620 	}
1621 out:
1622 	return rc;
1623 }
1624 
1625 static void *established_get_next(struct seq_file *seq, void *cur)
1626 {
1627 	struct sock *sk = cur;
1628 	struct inet_timewait_sock *tw;
1629 	struct hlist_node *node;
1630 	struct tcp_iter_state* st = seq->private;
1631 
1632 	++st->num;
1633 
1634 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1635 		tw = cur;
1636 		tw = tw_next(tw);
1637 get_tw:
1638 		while (tw && tw->tw_family != st->family) {
1639 			tw = tw_next(tw);
1640 		}
1641 		if (tw) {
1642 			cur = tw;
1643 			goto out;
1644 		}
1645 		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1646 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1647 
1648 		/* We can reschedule between buckets: */
1649 		cond_resched_softirq();
1650 
1651 		if (++st->bucket < tcp_hashinfo.ehash_size) {
1652 			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1653 			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1654 		} else {
1655 			cur = NULL;
1656 			goto out;
1657 		}
1658 	} else
1659 		sk = sk_next(sk);
1660 
1661 	sk_for_each_from(sk, node) {
1662 		if (sk->sk_family == st->family)
1663 			goto found;
1664 	}
1665 
1666 	st->state = TCP_SEQ_STATE_TIME_WAIT;
1667 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1668 	goto get_tw;
1669 found:
1670 	cur = sk;
1671 out:
1672 	return cur;
1673 }
1674 
1675 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1676 {
1677 	void *rc = established_get_first(seq);
1678 
1679 	while (rc && pos) {
1680 		rc = established_get_next(seq, rc);
1681 		--pos;
1682 	}
1683 	return rc;
1684 }
1685 
1686 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1687 {
1688 	void *rc;
1689 	struct tcp_iter_state* st = seq->private;
1690 
1691 	inet_listen_lock(&tcp_hashinfo);
1692 	st->state = TCP_SEQ_STATE_LISTENING;
1693 	rc	  = listening_get_idx(seq, &pos);
1694 
1695 	if (!rc) {
1696 		inet_listen_unlock(&tcp_hashinfo);
1697 		local_bh_disable();
1698 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1699 		rc	  = established_get_idx(seq, pos);
1700 	}
1701 
1702 	return rc;
1703 }
1704 
1705 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1706 {
1707 	struct tcp_iter_state* st = seq->private;
1708 	st->state = TCP_SEQ_STATE_LISTENING;
1709 	st->num = 0;
1710 	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1711 }
1712 
1713 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1714 {
1715 	void *rc = NULL;
1716 	struct tcp_iter_state* st;
1717 
1718 	if (v == SEQ_START_TOKEN) {
1719 		rc = tcp_get_idx(seq, 0);
1720 		goto out;
1721 	}
1722 	st = seq->private;
1723 
1724 	switch (st->state) {
1725 	case TCP_SEQ_STATE_OPENREQ:
1726 	case TCP_SEQ_STATE_LISTENING:
1727 		rc = listening_get_next(seq, v);
1728 		if (!rc) {
1729 			inet_listen_unlock(&tcp_hashinfo);
1730 			local_bh_disable();
1731 			st->state = TCP_SEQ_STATE_ESTABLISHED;
1732 			rc	  = established_get_first(seq);
1733 		}
1734 		break;
1735 	case TCP_SEQ_STATE_ESTABLISHED:
1736 	case TCP_SEQ_STATE_TIME_WAIT:
1737 		rc = established_get_next(seq, v);
1738 		break;
1739 	}
1740 out:
1741 	++*pos;
1742 	return rc;
1743 }
1744 
1745 static void tcp_seq_stop(struct seq_file *seq, void *v)
1746 {
1747 	struct tcp_iter_state* st = seq->private;
1748 
1749 	switch (st->state) {
1750 	case TCP_SEQ_STATE_OPENREQ:
1751 		if (v) {
1752 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1753 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1754 		}
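		/* fall through */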
1755 	case TCP_SEQ_STATE_LISTENING:
1756 		if (v != SEQ_START_TOKEN)
1757 			inet_listen_unlock(&tcp_hashinfo);
1758 		break;
1759 	case TCP_SEQ_STATE_TIME_WAIT:
1760 	case TCP_SEQ_STATE_ESTABLISHED:
1761 		if (v)
1762 			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1763 		local_bh_enable();
1764 		break;
1765 	}
1766 }
1767 
1768 static int tcp_seq_open(struct inode *inode, struct file *file)
1769 {
1770 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1771 	struct seq_file *seq;
1772 	struct tcp_iter_state *s;
1773 	int rc;
1774 
1775 	if (unlikely(afinfo == NULL))
1776 		return -EINVAL;
1777 
1778 	s = kmalloc(sizeof(*s), GFP_KERNEL);
1779 	if (!s)
1780 		return -ENOMEM;
1781 	memset(s, 0, sizeof(*s));
1782 	s->family		= afinfo->family;
1783 	s->seq_ops.start	= tcp_seq_start;
1784 	s->seq_ops.next		= tcp_seq_next;
1785 	s->seq_ops.show		= afinfo->seq_show;
1786 	s->seq_ops.stop		= tcp_seq_stop;
1787 
1788 	rc = seq_open(file, &s->seq_ops);
1789 	if (rc)
1790 		goto out_kfree;
1791 	seq	     = file->private_data;
1792 	seq->private = s;
1793 out:
1794 	return rc;
1795 out_kfree:
1796 	kfree(s);
1797 	goto out;
1798 }
1799 
1800 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1801 {
1802 	int rc = 0;
1803 	struct proc_dir_entry *p;
1804 
1805 	if (!afinfo)
1806 		return -EINVAL;
1807 	afinfo->seq_fops->owner		= afinfo->owner;
1808 	afinfo->seq_fops->open		= tcp_seq_open;
1809 	afinfo->seq_fops->read		= seq_read;
1810 	afinfo->seq_fops->llseek	= seq_lseek;
1811 	afinfo->seq_fops->release	= seq_release_private;
1812 
1813 	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1814 	if (p)
1815 		p->data = afinfo;
1816 	else
1817 		rc = -ENOMEM;
1818 	return rc;
1819 }
1820 
1821 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1822 {
1823 	if (!afinfo)
1824 		return;
1825 	proc_net_remove(afinfo->name);
1826 	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1827 }
1828 
1829 static void get_openreq4(struct sock *sk, struct request_sock *req,
1830 			 char *tmpbuf, int i, int uid)
1831 {
1832 	const struct inet_request_sock *ireq = inet_rsk(req);
1833 	int ttd = req->expires - jiffies;
1834 
1835 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1836 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1837 		i,
1838 		ireq->loc_addr,
1839 		ntohs(inet_sk(sk)->sport),
1840 		ireq->rmt_addr,
1841 		ntohs(ireq->rmt_port),
1842 		TCP_SYN_RECV,
1843 		0, 0, /* could print option size, but that is af dependent. */
1844 		1,    /* timers active (only the expire timer) */
1845 		jiffies_to_clock_t(ttd),
1846 		req->retrans,
1847 		uid,
1848 		0,  /* non standard timer */
1849 		0, /* open_requests have no inode */
1850 		atomic_read(&sk->sk_refcnt),
1851 		req);
1852 }
1853 
1854 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1855 {
1856 	int timer_active;
1857 	unsigned long timer_expires;
1858 	struct tcp_sock *tp = tcp_sk(sp);
1859 	const struct inet_connection_sock *icsk = inet_csk(sp);
1860 	struct inet_sock *inet = inet_sk(sp);
1861 	unsigned int dest = inet->daddr;
1862 	unsigned int src = inet->rcv_saddr;
1863 	__u16 destp = ntohs(inet->dport);
1864 	__u16 srcp = ntohs(inet->sport);
1865 
1866 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1867 		timer_active	= 1;
1868 		timer_expires	= icsk->icsk_timeout;
1869 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1870 		timer_active	= 4;
1871 		timer_expires	= icsk->icsk_timeout;
1872 	} else if (timer_pending(&sp->sk_timer)) {
1873 		timer_active	= 2;
1874 		timer_expires	= sp->sk_timer.expires;
1875 	} else {
1876 		timer_active	= 0;
1877 		timer_expires = jiffies;
1878 	}
1879 
1880 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1881 			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
1882 		i, src, srcp, dest, destp, sp->sk_state,
1883 		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
1884 		timer_active,
1885 		jiffies_to_clock_t(timer_expires - jiffies),
1886 		icsk->icsk_retransmits,
1887 		sock_i_uid(sp),
1888 		icsk->icsk_probes_out,
1889 		sock_i_ino(sp),
1890 		atomic_read(&sp->sk_refcnt), sp,
1891 		icsk->icsk_rto,
1892 		icsk->icsk_ack.ato,
1893 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1894 		tp->snd_cwnd,
1895 		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1896 }
1897 
1898 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1899 {
1900 	unsigned int dest, src;
1901 	__u16 destp, srcp;
1902 	int ttd = tw->tw_ttd - jiffies;
1903 
1904 	if (ttd < 0)
1905 		ttd = 0;
1906 
1907 	dest  = tw->tw_daddr;
1908 	src   = tw->tw_rcv_saddr;
1909 	destp = ntohs(tw->tw_dport);
1910 	srcp  = ntohs(tw->tw_sport);
1911 
1912 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1913 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1914 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1915 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1916 		atomic_read(&tw->tw_refcnt), tw);
1917 }
1918 
1919 #define TMPSZ 150
1920 
1921 static int tcp4_seq_show(struct seq_file *seq, void *v)
1922 {
1923 	struct tcp_iter_state* st;
1924 	char tmpbuf[TMPSZ + 1];
1925 
1926 	if (v == SEQ_START_TOKEN) {
1927 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
1928 			   "  sl  local_address rem_address   st tx_queue "
1929 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
1930 			   "inode");
1931 		goto out;
1932 	}
1933 	st = seq->private;
1934 
1935 	switch (st->state) {
1936 	case TCP_SEQ_STATE_LISTENING:
1937 	case TCP_SEQ_STATE_ESTABLISHED:
1938 		get_tcp4_sock(v, tmpbuf, st->num);
1939 		break;
1940 	case TCP_SEQ_STATE_OPENREQ:
1941 		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1942 		break;
1943 	case TCP_SEQ_STATE_TIME_WAIT:
1944 		get_timewait4_sock(v, tmpbuf, st->num);
1945 		break;
1946 	}
1947 	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1948 out:
1949 	return 0;
1950 }
1951 
1952 static struct file_operations tcp4_seq_fops;
1953 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1954 	.owner		= THIS_MODULE,
1955 	.name		= "tcp",
1956 	.family		= AF_INET,
1957 	.seq_show	= tcp4_seq_show,
1958 	.seq_fops	= &tcp4_seq_fops,
1959 };
1960 
1961 int __init tcp4_proc_init(void)
1962 {
1963 	return tcp_proc_register(&tcp4_seq_afinfo);
1964 }
1965 
1966 void tcp4_proc_exit(void)
1967 {
1968 	tcp_proc_unregister(&tcp4_seq_afinfo);
1969 }
1970 #endif /* CONFIG_PROC_FS */
1971 
1972 struct proto tcp_prot = {
1973 	.name			= "TCP",
1974 	.owner			= THIS_MODULE,
1975 	.close			= tcp_close,
1976 	.connect		= tcp_v4_connect,
1977 	.disconnect		= tcp_disconnect,
1978 	.accept			= inet_csk_accept,
1979 	.ioctl			= tcp_ioctl,
1980 	.init			= tcp_v4_init_sock,
1981 	.destroy		= tcp_v4_destroy_sock,
1982 	.shutdown		= tcp_shutdown,
1983 	.setsockopt		= tcp_setsockopt,
1984 	.getsockopt		= tcp_getsockopt,
1985 	.sendmsg		= tcp_sendmsg,
1986 	.recvmsg		= tcp_recvmsg,
1987 	.backlog_rcv		= tcp_v4_do_rcv,
1988 	.hash			= tcp_v4_hash,
1989 	.unhash			= tcp_unhash,
1990 	.get_port		= tcp_v4_get_port,
1991 	.enter_memory_pressure	= tcp_enter_memory_pressure,
1992 	.sockets_allocated	= &tcp_sockets_allocated,
1993 	.orphan_count		= &tcp_orphan_count,
1994 	.memory_allocated	= &tcp_memory_allocated,
1995 	.memory_pressure	= &tcp_memory_pressure,
1996 	.sysctl_mem		= sysctl_tcp_mem,
1997 	.sysctl_wmem		= sysctl_tcp_wmem,
1998 	.sysctl_rmem		= sysctl_tcp_rmem,
1999 	.max_header		= MAX_TCP_HEADER,
2000 	.obj_size		= sizeof(struct tcp_sock),
2001 	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
2002 	.rsk_prot		= &tcp_request_sock_ops,
2003 };
2004 
2005 
2006 
2007 void __init tcp_v4_init(struct net_proto_family *ops)
2008 {
2009 	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2010 	if (err < 0)
2011 		panic("Failed to create the TCP control socket.\n");
2012 	tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2013 	inet_sk(tcp_socket->sk)->uc_ttl = -1;
2014 
2015 	/* Unhash it so that IP input processing does not even
2016 	 * see it, we do not wish this socket to see incoming
2017 	 * packets.
2018 	 */
2019 	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2020 }
2021 
2022 EXPORT_SYMBOL(ipv4_specific);
2023 EXPORT_SYMBOL(inet_bind_bucket_create);
2024 EXPORT_SYMBOL(tcp_hashinfo);
2025 EXPORT_SYMBOL(tcp_prot);
2026 EXPORT_SYMBOL(tcp_unhash);
2027 EXPORT_SYMBOL(tcp_v4_conn_request);
2028 EXPORT_SYMBOL(tcp_v4_connect);
2029 EXPORT_SYMBOL(tcp_v4_do_rcv);
2030 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2031 EXPORT_SYMBOL(tcp_v4_send_check);
2032 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2033 
2034 #ifdef CONFIG_PROC_FS
2035 EXPORT_SYMBOL(tcp_proc_register);
2036 EXPORT_SYMBOL(tcp_proc_unregister);
2037 #endif
2038 EXPORT_SYMBOL(sysctl_local_port_range);
2039 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2040 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2041 
2042