xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision d67b569f)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *		IPv4 specific functions
11  *
12  *
13  *		code split from:
14  *		linux/ipv4/tcp.c
15  *		linux/ipv4/tcp_input.c
16  *		linux/ipv4/tcp_output.c
17  *
18  *		See tcp.c for author information
19  *
20  *	This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25 
26 /*
27  * Changes:
28  *		David S. Miller	:	New socket lookup architecture.
29  *					This code is dedicated to John Dyson.
30  *		David S. Miller :	Change semantics of established hash,
31  *					half is devoted to TIME_WAIT sockets
32  *					and the rest go in the other half.
33  *		Andi Kleen :		Add support for syncookies and fixed
34  *					some bugs: ip options weren't passed to
35  *					the TCP layer, missed a check for an
36  *					ACK bit.
37  *		Andi Kleen :		Implemented fast path mtu discovery.
38  *	     				Fixed many serious bugs in the
39  *					request_sock handling and moved
40  *					most of it into the af independent code.
41  *					Added tail drop and some other bugfixes.
42  *					Added new listen semantics.
43  *		Mike McLagan	:	Routing by source
44  *	Juan Jose Ciarlante:		ip_dynaddr bits
45  *		Andi Kleen:		various fixes.
46  *	Vitaly E. Lavrov	:	Transparent proxy revived after a year
47  *					in a coma.
48  *	Andi Kleen		:	Fix new listen.
49  *	Andi Kleen		:	Fix accept error reporting.
50  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
51  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
52  *					a single port at the same time.
53  */
54 
55 #include <linux/config.h>
56 
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65 
66 #include <net/icmp.h>
67 #include <net/tcp.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/xfrm.h>
71 
72 #include <linux/inet.h>
73 #include <linux/ipv6.h>
74 #include <linux/stddef.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77 
78 extern int sysctl_ip_dynaddr;
79 int sysctl_tcp_tw_reuse;
80 int sysctl_tcp_low_latency;
81 
82 /* Check TCP sequence numbers in ICMP packets. */
83 #define ICMP_MIN_LENGTH 8
84 
85 /* Socket used for sending RSTs */
86 static struct socket *tcp_socket;
87 
88 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
89 		       struct sk_buff *skb);
90 
91 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
92 	.__tcp_lhash_lock	=	RW_LOCK_UNLOCKED,
93 	.__tcp_lhash_users	=	ATOMIC_INIT(0),
94 	.__tcp_lhash_wait
95 	  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
96 	.__tcp_portalloc_lock	=	SPIN_LOCK_UNLOCKED
97 };
98 
99 /*
100  * This array holds the first and last local port number.
101  * For high-usage systems, use sysctl to change this to
102  * 32768-61000
103  */
104 int sysctl_local_port_range[2] = { 1024, 4999 };
105 int tcp_port_rover = 1024 - 1;
106 
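/* Hash a connection's 4-tuple into an index in the established hash
 * table.  The shifts fold the upper bits down before masking with
 * (tcp_ehash_size - 1); tcp_sk_hashfn() below is the per-socket wrapper.
 */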
107 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
108 				 __u32 faddr, __u16 fport)
109 {
110 	int h = (laddr ^ lport) ^ (faddr ^ fport);
111 	h ^= h >> 16;
112 	h ^= h >> 8;
113 	return h & (tcp_ehash_size - 1);
114 }
115 
116 static __inline__ int tcp_sk_hashfn(struct sock *sk)
117 {
118 	struct inet_sock *inet = inet_sk(sk);
119 	__u32 laddr = inet->rcv_saddr;
120 	__u16 lport = inet->num;
121 	__u32 faddr = inet->daddr;
122 	__u16 fport = inet->dport;
123 
124 	return tcp_hashfn(laddr, lport, faddr, fport);
125 }
126 
127 /* Allocate and initialize a new TCP local port bind bucket.
128  * The bindhash mutex for snum's hash chain must be held here.
129  */
130 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
131 					  unsigned short snum)
132 {
133 	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
134 						      SLAB_ATOMIC);
135 	if (tb) {
136 		tb->port = snum;
137 		tb->fastreuse = 0;
138 		INIT_HLIST_HEAD(&tb->owners);
139 		hlist_add_head(&tb->node, &head->chain);
140 	}
141 	return tb;
142 }
143 
144 /* Caller must hold hashbucket lock for this tb with local BH disabled */
145 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
146 {
147 	if (hlist_empty(&tb->owners)) {
148 		__hlist_del(&tb->node);
149 		kmem_cache_free(tcp_bucket_cachep, tb);
150 	}
151 }
152 
153 /* Caller must disable local BH processing. */
154 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
155 {
156 	struct tcp_bind_hashbucket *head =
157 				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
158 	struct tcp_bind_bucket *tb;
159 
160 	spin_lock(&head->lock);
161 	tb = tcp_sk(sk)->bind_hash;
162 	sk_add_bind_node(child, &tb->owners);
163 	tcp_sk(child)->bind_hash = tb;
164 	spin_unlock(&head->lock);
165 }
166 
167 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
168 {
169 	local_bh_disable();
170 	__tcp_inherit_port(sk, child);
171 	local_bh_enable();
172 }
173 
174 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
175 		   unsigned short snum)
176 {
177 	inet_sk(sk)->num = snum;
178 	sk_add_bind_node(sk, &tb->owners);
179 	tcp_sk(sk)->bind_hash = tb;
180 }
181 
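/* Return non-zero when another socket already bound to this bucket
 * conflicts with sk: the devices are compatible (either side unbound,
 * or both bound to the same one) and, unless both sockets set
 * SO_REUSEADDR and the existing one is not listening, their receive
 * addresses overlap (a wildcard address matches everything).
 */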
182 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
183 {
184 	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
185 	struct sock *sk2;
186 	struct hlist_node *node;
187 	int reuse = sk->sk_reuse;
188 
189 	sk_for_each_bound(sk2, node, &tb->owners) {
190 		if (sk != sk2 &&
191 		    !tcp_v6_ipv6only(sk2) &&
192 		    (!sk->sk_bound_dev_if ||
193 		     !sk2->sk_bound_dev_if ||
194 		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195 			if (!reuse || !sk2->sk_reuse ||
196 			    sk2->sk_state == TCP_LISTEN) {
197 				const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
198 				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
199 				    sk2_rcv_saddr == sk_rcv_saddr)
200 					break;
201 			}
202 		}
203 	}
204 	return node != NULL;
205 }
206 
207 /* Obtain a reference to a local port for the given sock,
208  * if snum is zero it means select any available local port.
209  */
210 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
211 {
212 	struct tcp_bind_hashbucket *head;
213 	struct hlist_node *node;
214 	struct tcp_bind_bucket *tb;
215 	int ret;
216 
217 	local_bh_disable();
218 	if (!snum) {
219 		int low = sysctl_local_port_range[0];
220 		int high = sysctl_local_port_range[1];
221 		int remaining = (high - low) + 1;
222 		int rover;
223 
224 		spin_lock(&tcp_portalloc_lock);
225 		if (tcp_port_rover < low)
226 			rover = low;
227 		else
228 			rover = tcp_port_rover;
229 		do {
230 			rover++;
231 			if (rover > high)
232 				rover = low;
233 			head = &tcp_bhash[tcp_bhashfn(rover)];
234 			spin_lock(&head->lock);
235 			tb_for_each(tb, node, &head->chain)
236 				if (tb->port == rover)
237 					goto next;
238 			break;
239 		next:
240 			spin_unlock(&head->lock);
241 		} while (--remaining > 0);
242 		tcp_port_rover = rover;
243 		spin_unlock(&tcp_portalloc_lock);
244 
245 		/* Exhausted local port range during search? */
246 		ret = 1;
247 		if (remaining <= 0)
248 			goto fail;
249 
250 		/* OK, here is the one we will use.  HEAD is
251 		 * non-NULL and we hold its lock.
252 		 */
253 		snum = rover;
254 	} else {
255 		head = &tcp_bhash[tcp_bhashfn(snum)];
256 		spin_lock(&head->lock);
257 		tb_for_each(tb, node, &head->chain)
258 			if (tb->port == snum)
259 				goto tb_found;
260 	}
261 	tb = NULL;
262 	goto tb_not_found;
263 tb_found:
264 	if (!hlist_empty(&tb->owners)) {
265 		if (sk->sk_reuse > 1)
266 			goto success;
267 		if (tb->fastreuse > 0 &&
268 		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
269 			goto success;
270 		} else {
271 			ret = 1;
272 			if (tcp_bind_conflict(sk, tb))
273 				goto fail_unlock;
274 		}
275 	}
276 tb_not_found:
277 	ret = 1;
278 	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
279 		goto fail_unlock;
280 	if (hlist_empty(&tb->owners)) {
281 		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
282 			tb->fastreuse = 1;
283 		else
284 			tb->fastreuse = 0;
285 	} else if (tb->fastreuse &&
286 		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
287 		tb->fastreuse = 0;
288 success:
289 	if (!tcp_sk(sk)->bind_hash)
290 		tcp_bind_hash(sk, tb, snum);
291 	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
292  	ret = 0;
293 
294 fail_unlock:
295 	spin_unlock(&head->lock);
296 fail:
297 	local_bh_enable();
298 	return ret;
299 }
300 
301 /* Get rid of any references to a local port held by the
302  * given sock.
303  */
304 static void __tcp_put_port(struct sock *sk)
305 {
306 	struct inet_sock *inet = inet_sk(sk);
307 	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
308 	struct tcp_bind_bucket *tb;
309 
310 	spin_lock(&head->lock);
311 	tb = tcp_sk(sk)->bind_hash;
312 	__sk_del_bind_node(sk);
313 	tcp_sk(sk)->bind_hash = NULL;
314 	inet->num = 0;
315 	tcp_bucket_destroy(tb);
316 	spin_unlock(&head->lock);
317 }
318 
319 void tcp_put_port(struct sock *sk)
320 {
321 	local_bh_disable();
322 	__tcp_put_port(sk);
323 	local_bh_enable();
324 }
325 
326 /* Waiting on this lock without WQ_FLAG_EXCLUSIVE is fine on UP but can be very
327  * bad on SMP: when several writers sleep and a reader wakes them up, all but
328  * one immediately hit the write lock and grab all the CPUs. Exclusive sleep
329  * solves this, _but_ remember, it adds useless work on UP machines (a wake up
330  * on each exclusive lock release). It should really be ifdefed.
331  */
332 
333 void tcp_listen_wlock(void)
334 {
335 	write_lock(&tcp_lhash_lock);
336 
337 	if (atomic_read(&tcp_lhash_users)) {
338 		DEFINE_WAIT(wait);
339 
340 		for (;;) {
341 			prepare_to_wait_exclusive(&tcp_lhash_wait,
342 						&wait, TASK_UNINTERRUPTIBLE);
343 			if (!atomic_read(&tcp_lhash_users))
344 				break;
345 			write_unlock_bh(&tcp_lhash_lock);
346 			schedule();
347 			write_lock_bh(&tcp_lhash_lock);
348 		}
349 
350 		finish_wait(&tcp_lhash_wait, &wait);
351 	}
352 }
353 
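/* Insert the socket into the listening hash or the established hash,
 * depending on its state.  Local BH must already be disabled; callers
 * pass listen_possible == 0 when sk is known not to be listening
 * (e.g. when hashing a connecting socket).
 */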
354 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
355 {
356 	struct hlist_head *list;
357 	rwlock_t *lock;
358 
359 	BUG_TRAP(sk_unhashed(sk));
360 	if (listen_possible && sk->sk_state == TCP_LISTEN) {
361 		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
362 		lock = &tcp_lhash_lock;
363 		tcp_listen_wlock();
364 	} else {
365 		list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
366 		lock = &tcp_ehash[sk->sk_hashent].lock;
367 		write_lock(lock);
368 	}
369 	__sk_add_node(sk, list);
370 	sock_prot_inc_use(sk->sk_prot);
371 	write_unlock(lock);
372 	if (listen_possible && sk->sk_state == TCP_LISTEN)
373 		wake_up(&tcp_lhash_wait);
374 }
375 
376 static void tcp_v4_hash(struct sock *sk)
377 {
378 	if (sk->sk_state != TCP_CLOSE) {
379 		local_bh_disable();
380 		__tcp_v4_hash(sk, 1);
381 		local_bh_enable();
382 	}
383 }
384 
385 void tcp_unhash(struct sock *sk)
386 {
387 	rwlock_t *lock;
388 
389 	if (sk_unhashed(sk))
390 		goto ende;
391 
392 	if (sk->sk_state == TCP_LISTEN) {
393 		local_bh_disable();
394 		tcp_listen_wlock();
395 		lock = &tcp_lhash_lock;
396 	} else {
397 		struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
398 		lock = &head->lock;
399 		write_lock_bh(&head->lock);
400 	}
401 
402 	if (__sk_del_node_init(sk))
403 		sock_prot_dec_use(sk->sk_prot);
404 	write_unlock_bh(lock);
405 
406  ende:
407 	if (sk->sk_state == TCP_LISTEN)
408 		wake_up(&tcp_lhash_wait);
409 }
410 
411 /* Don't inline this cruft.  There are some nice properties to
412  * exploit here.  The BSD API does not allow a listening TCP
413  * to specify the remote port nor the remote address for the
414  * connection.  So always assume those are both wildcarded
415  * during the search since they can never be otherwise.
416  */
417 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
418 					     unsigned short hnum, int dif)
419 {
420 	struct sock *result = NULL, *sk;
421 	struct hlist_node *node;
422 	int score, hiscore;
423 
424 	hiscore=-1;
425 	sk_for_each(sk, node, head) {
426 		struct inet_sock *inet = inet_sk(sk);
427 
428 		if (inet->num == hnum && !ipv6_only_sock(sk)) {
429 			__u32 rcv_saddr = inet->rcv_saddr;
430 
431 			score = (sk->sk_family == PF_INET ? 1 : 0);
432 			if (rcv_saddr) {
433 				if (rcv_saddr != daddr)
434 					continue;
435 				score+=2;
436 			}
437 			if (sk->sk_bound_dev_if) {
438 				if (sk->sk_bound_dev_if != dif)
439 					continue;
440 				score+=2;
441 			}
442 			if (score == 5)
443 				return sk;
444 			if (score > hiscore) {
445 				hiscore = score;
446 				result = sk;
447 			}
448 		}
449 	}
450 	return result;
451 }
452 
453 /* Optimize the common listener case. */
454 static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
455 		unsigned short hnum, int dif)
456 {
457 	struct sock *sk = NULL;
458 	struct hlist_head *head;
459 
460 	read_lock(&tcp_lhash_lock);
461 	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
462 	if (!hlist_empty(head)) {
463 		struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
464 
465 		if (inet->num == hnum && !sk->sk_node.next &&
466 		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
467 		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
468 		    !sk->sk_bound_dev_if)
469 			goto sherry_cache;
470 		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
471 	}
472 	if (sk) {
473 sherry_cache:
474 		sock_hold(sk);
475 	}
476 	read_unlock(&tcp_lhash_lock);
477 	return sk;
478 }
479 
480 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
481  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
482  *
483  * Local BH must be disabled here.
484  */
485 
486 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
487 						       u32 daddr, u16 hnum,
488 						       int dif)
489 {
490 	struct tcp_ehash_bucket *head;
491 	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
492 	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
493 	struct sock *sk;
494 	struct hlist_node *node;
495 	/* Optimize here for direct hit, only listening connections can
496 	 * have wildcards anyway.
497 	 */
498 	int hash = tcp_hashfn(daddr, hnum, saddr, sport);
499 	head = &tcp_ehash[hash];
500 	read_lock(&head->lock);
501 	sk_for_each(sk, node, &head->chain) {
502 		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
503 			goto hit; /* You sunk my battleship! */
504 	}
505 
506 	/* Must check for a TIME_WAIT'er before going to listener hash. */
507 	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
508 		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
509 			goto hit;
510 	}
511 	sk = NULL;
512 out:
513 	read_unlock(&head->lock);
514 	return sk;
515 hit:
516 	sock_hold(sk);
517 	goto out;
518 }
519 
520 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
521 					   u32 daddr, u16 hnum, int dif)
522 {
523 	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
524 						      daddr, hnum, dif);
525 
526 	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
527 }
528 
529 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
530 				  u16 dport, int dif)
531 {
532 	struct sock *sk;
533 
534 	local_bh_disable();
535 	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
536 	local_bh_enable();
537 
538 	return sk;
539 }
540 
541 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
542 
543 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
544 {
545 	return secure_tcp_sequence_number(skb->nh.iph->daddr,
546 					  skb->nh.iph->saddr,
547 					  skb->h.th->dest,
548 					  skb->h.th->source);
549 }
550 
551 /* called with local bh disabled */
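/* Verify that binding this socket to lport yields a unique established
 * 4-tuple.  A matching TIME-WAIT bucket may be recycled: it is returned
 * via *twp when twp is non-NULL, otherwise it is descheduled here.  A
 * matching established socket makes the attempt fail with -EADDRNOTAVAIL.
 * On success the socket is also added to the established hash.
 */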
552 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
553 				      struct tcp_tw_bucket **twp)
554 {
555 	struct inet_sock *inet = inet_sk(sk);
556 	u32 daddr = inet->rcv_saddr;
557 	u32 saddr = inet->daddr;
558 	int dif = sk->sk_bound_dev_if;
559 	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
560 	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
561 	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
562 	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
563 	struct sock *sk2;
564 	struct hlist_node *node;
565 	struct tcp_tw_bucket *tw;
566 
567 	write_lock(&head->lock);
568 
569 	/* Check TIME-WAIT sockets first. */
570 	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
571 		tw = (struct tcp_tw_bucket *)sk2;
572 
573 		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
574 			struct tcp_sock *tp = tcp_sk(sk);
575 
576 			/* With PAWS, it is safe from the viewpoint
577 			   of data integrity. Even without PAWS it
578 			   is safe provided sequence spaces do not
579 			   overlap, i.e. at data rates <= 80Mbit/sec.
580 
581 			   Actually, the idea is close to VJ's, only
582 			   the timestamp cache is held not per host
583 			   but per port pair, and the TW bucket is
584 			   used as the state holder.
585 
586 			   If the TW bucket has already been destroyed,
587 			   we fall back to VJ's scheme and use the initial
588 			   timestamp retrieved from the peer table.
589 			 */
590 			if (tw->tw_ts_recent_stamp &&
591 			    (!twp || (sysctl_tcp_tw_reuse &&
592 				      xtime.tv_sec -
593 				      tw->tw_ts_recent_stamp > 1))) {
594 				if ((tp->write_seq =
595 						tw->tw_snd_nxt + 65535 + 2) == 0)
596 					tp->write_seq = 1;
597 				tp->rx_opt.ts_recent	   = tw->tw_ts_recent;
598 				tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
599 				sock_hold(sk2);
600 				goto unique;
601 			} else
602 				goto not_unique;
603 		}
604 	}
605 	tw = NULL;
606 
607 	/* And established part... */
608 	sk_for_each(sk2, node, &head->chain) {
609 		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
610 			goto not_unique;
611 	}
612 
613 unique:
614 	/* Must record num and sport now. Otherwise we will see
615 	 * a socket with a funny identity in the hash table. */
616 	inet->num = lport;
617 	inet->sport = htons(lport);
618 	sk->sk_hashent = hash;
619 	BUG_TRAP(sk_unhashed(sk));
620 	__sk_add_node(sk, &head->chain);
621 	sock_prot_inc_use(sk->sk_prot);
622 	write_unlock(&head->lock);
623 
624 	if (twp) {
625 		*twp = tw;
626 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
627 	} else if (tw) {
628 		/* Silly. Should hash-dance instead... */
629 		tcp_tw_deschedule(tw);
630 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
631 
632 		tcp_tw_put(tw);
633 	}
634 
635 	return 0;
636 
637 not_unique:
638 	write_unlock(&head->lock);
639 	return -EADDRNOTAVAIL;
640 }
641 
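/* Per-destination starting point for the ephemeral port search below,
 * so that different peers walk the local port space in different orders.
 */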
642 static inline u32 connect_port_offset(const struct sock *sk)
643 {
644 	const struct inet_sock *inet = inet_sk(sk);
645 
646 	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
647 					 inet->dport);
648 }
649 
650 /*
651  * Bind a port for a connect operation and hash it.
652  */
653 static inline int tcp_v4_hash_connect(struct sock *sk)
654 {
655 	unsigned short snum = inet_sk(sk)->num;
656  	struct tcp_bind_hashbucket *head;
657  	struct tcp_bind_bucket *tb;
658 	int ret;
659 
660  	if (!snum) {
661  		int low = sysctl_local_port_range[0];
662  		int high = sysctl_local_port_range[1];
663 		int range = high - low;
664  		int i;
665 		int port;
666 		static u32 hint;
667 		u32 offset = hint + connect_port_offset(sk);
668 		struct hlist_node *node;
669  		struct tcp_tw_bucket *tw = NULL;
670 
671  		local_bh_disable();
672 		for (i = 1; i <= range; i++) {
673 			port = low + (i + offset) % range;
674  			head = &tcp_bhash[tcp_bhashfn(port)];
675  			spin_lock(&head->lock);
676 
677  			/* Does not bother with rcv_saddr checks,
678  			 * because the established check is already
679  			 * unique enough.
680  			 */
681 			tb_for_each(tb, node, &head->chain) {
682  				if (tb->port == port) {
683  					BUG_TRAP(!hlist_empty(&tb->owners));
684  					if (tb->fastreuse >= 0)
685  						goto next_port;
686  					if (!__tcp_v4_check_established(sk,
687 									port,
688 									&tw))
689  						goto ok;
690  					goto next_port;
691  				}
692  			}
693 
694  			tb = tcp_bucket_create(head, port);
695  			if (!tb) {
696  				spin_unlock(&head->lock);
697  				break;
698  			}
699  			tb->fastreuse = -1;
700  			goto ok;
701 
702  		next_port:
703  			spin_unlock(&head->lock);
704  		}
705  		local_bh_enable();
706 
707  		return -EADDRNOTAVAIL;
708 
709 ok:
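		/* Remember how far this search walked so the next one
		 * starts close to where we succeeded. */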
710 		hint += i;
711 
712  		/* Head lock still held and bh's disabled */
713  		tcp_bind_hash(sk, tb, port);
714 		if (sk_unhashed(sk)) {
715  			inet_sk(sk)->sport = htons(port);
716  			__tcp_v4_hash(sk, 0);
717  		}
718  		spin_unlock(&head->lock);
719 
720  		if (tw) {
721  			tcp_tw_deschedule(tw);
722  			tcp_tw_put(tw);
723  		}
724 
725 		ret = 0;
726 		goto out;
727  	}
728 
729  	head  = &tcp_bhash[tcp_bhashfn(snum)];
730  	tb  = tcp_sk(sk)->bind_hash;
731 	spin_lock_bh(&head->lock);
732 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
733 		__tcp_v4_hash(sk, 0);
734 		spin_unlock_bh(&head->lock);
735 		return 0;
736 	} else {
737 		spin_unlock(&head->lock);
738 		/* No definite answer... Walk the established hash table */
739 		ret = __tcp_v4_check_established(sk, snum, NULL);
740 out:
741 		local_bh_enable();
742 		return ret;
743 	}
744 }
745 
746 /* This will initiate an outgoing connection. */
747 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
748 {
749 	struct inet_sock *inet = inet_sk(sk);
750 	struct tcp_sock *tp = tcp_sk(sk);
751 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
752 	struct rtable *rt;
753 	u32 daddr, nexthop;
754 	int tmp;
755 	int err;
756 
757 	if (addr_len < sizeof(struct sockaddr_in))
758 		return -EINVAL;
759 
760 	if (usin->sin_family != AF_INET)
761 		return -EAFNOSUPPORT;
762 
763 	nexthop = daddr = usin->sin_addr.s_addr;
764 	if (inet->opt && inet->opt->srr) {
765 		if (!daddr)
766 			return -EINVAL;
767 		nexthop = inet->opt->faddr;
768 	}
769 
770 	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
771 			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
772 			       IPPROTO_TCP,
773 			       inet->sport, usin->sin_port, sk);
774 	if (tmp < 0)
775 		return tmp;
776 
777 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
778 		ip_rt_put(rt);
779 		return -ENETUNREACH;
780 	}
781 
782 	if (!inet->opt || !inet->opt->srr)
783 		daddr = rt->rt_dst;
784 
785 	if (!inet->saddr)
786 		inet->saddr = rt->rt_src;
787 	inet->rcv_saddr = inet->saddr;
788 
789 	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
790 		/* Reset inherited state */
791 		tp->rx_opt.ts_recent	   = 0;
792 		tp->rx_opt.ts_recent_stamp = 0;
793 		tp->write_seq		   = 0;
794 	}
795 
796 	if (sysctl_tcp_tw_recycle &&
797 	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
798 		struct inet_peer *peer = rt_get_peer(rt);
799 
800 		/* VJ's idea. We save the last timestamp seen from
801 		 * the destination in the peer table when entering TIME-WAIT state,
802 		 * and initialize rx_opt.ts_recent from it when trying a new connection.
803 		 */
804 
805 		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
806 			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
807 			tp->rx_opt.ts_recent = peer->tcp_ts;
808 		}
809 	}
810 
811 	inet->dport = usin->sin_port;
812 	inet->daddr = daddr;
813 
814 	tp->ext_header_len = 0;
815 	if (inet->opt)
816 		tp->ext_header_len = inet->opt->optlen;
817 
818 	tp->rx_opt.mss_clamp = 536;
819 
820 	/* Socket identity is still unknown (sport may be zero).
821 	 * However, we set the state to SYN-SENT and, without releasing the
822 	 * socket lock, select a source port, enter ourselves into the hash
823 	 * tables and complete initialization after this.
824 	 */
825 	tcp_set_state(sk, TCP_SYN_SENT);
826 	err = tcp_v4_hash_connect(sk);
827 	if (err)
828 		goto failure;
829 
830 	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
831 	if (err)
832 		goto failure;
833 
834 	/* OK, now commit destination to socket.  */
835 	__sk_dst_set(sk, &rt->u.dst);
836 	tcp_v4_setup_caps(sk, &rt->u.dst);
837 
838 	if (!tp->write_seq)
839 		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
840 							   inet->daddr,
841 							   inet->sport,
842 							   usin->sin_port);
843 
844 	inet->id = tp->write_seq ^ jiffies;
845 
846 	err = tcp_connect(sk);
847 	rt = NULL;
848 	if (err)
849 		goto failure;
850 
851 	return 0;
852 
853 failure:
854 	/* This unhashes the socket and releases the local port, if necessary. */
855 	tcp_set_state(sk, TCP_CLOSE);
856 	ip_rt_put(rt);
857 	sk->sk_route_caps = 0;
858 	inet->dport = 0;
859 	return err;
860 }
861 
862 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
863 {
864 	return ((struct rtable *)skb->dst)->rt_iif;
865 }
866 
867 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
868 {
869 	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
870 }
871 
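/* Look up a pending connection request in the listener's SYN table by
 * remote port, remote address and local address; *prevp is set to the
 * link pointer so the caller can unlink the entry.
 */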
872 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
873 					      struct request_sock ***prevp,
874 					      __u16 rport,
875 					      __u32 raddr, __u32 laddr)
876 {
877 	struct listen_sock *lopt = tp->accept_queue.listen_opt;
878 	struct request_sock *req, **prev;
879 
880 	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
881 	     (req = *prev) != NULL;
882 	     prev = &req->dl_next) {
883 		const struct inet_request_sock *ireq = inet_rsk(req);
884 
885 		if (ireq->rmt_port == rport &&
886 		    ireq->rmt_addr == raddr &&
887 		    ireq->loc_addr == laddr &&
888 		    TCP_INET_FAMILY(req->rsk_ops->family)) {
889 			BUG_TRAP(!req->sk);
890 			*prevp = prev;
891 			break;
892 		}
893 	}
894 
895 	return req;
896 }
897 
898 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
899 {
900 	struct tcp_sock *tp = tcp_sk(sk);
901 	struct listen_sock *lopt = tp->accept_queue.listen_opt;
902 	u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
903 
904 	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
905 	tcp_synq_added(sk);
906 }
907 
908 
909 /*
910  * This routine does path mtu discovery as defined in RFC1191.
911  */
912 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
913 				     u32 mtu)
914 {
915 	struct dst_entry *dst;
916 	struct inet_sock *inet = inet_sk(sk);
917 	struct tcp_sock *tp = tcp_sk(sk);
918 
919 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
920 	 * sent out by Linux are always < 576 bytes, so they should go through
921 	 * unfragmented).
922 	 */
923 	if (sk->sk_state == TCP_LISTEN)
924 		return;
925 
926 	/* We don't check in the dst entry whether pmtu discovery is forbidden
927 	 * on this route. We just assume that no packet-too-big packets
928 	 * are sent back when pmtu discovery is not active.
929 	 * There is a small race when the user changes this flag in the
930 	 * route, but I think that's acceptable.
931 	 */
932 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
933 		return;
934 
935 	dst->ops->update_pmtu(dst, mtu);
936 
937 	/* Something is about to go wrong... Remember the soft error
938 	 * in case this connection is not able to recover.
939 	 */
940 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
941 		sk->sk_err_soft = EMSGSIZE;
942 
943 	mtu = dst_mtu(dst);
944 
945 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
946 	    tp->pmtu_cookie > mtu) {
947 		tcp_sync_mss(sk, mtu);
948 
949 		/* Resend the TCP packet because it's
950 		 * clear that the old packet has been
951 		 * dropped. This is the new "fast" path mtu
952 		 * discovery.
953 		 */
954 		tcp_simple_retransmit(sk);
955 	} /* else let the usual retransmit timer handle it */
956 }
957 
958 /*
959  * This routine is called by the ICMP module when it gets some
960  * sort of error condition.  If err < 0 then the socket should
961  * be closed and the error returned to the user.  If err > 0
962  * it's just the icmp type << 8 | icmp code.  After adjustment
963  * header points to the first 8 bytes of the tcp header.  We need
964  * to find the appropriate port.
965  *
966  * The locking strategy used here is very "optimistic". When
967  * someone else accesses the socket the ICMP is just dropped
968  * and for some paths there is no check at all.
969  * A more general error queue to queue errors for later handling
970  * is probably better.
971  *
972  */
973 
974 void tcp_v4_err(struct sk_buff *skb, u32 info)
975 {
976 	struct iphdr *iph = (struct iphdr *)skb->data;
977 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
978 	struct tcp_sock *tp;
979 	struct inet_sock *inet;
980 	int type = skb->h.icmph->type;
981 	int code = skb->h.icmph->code;
982 	struct sock *sk;
983 	__u32 seq;
984 	int err;
985 
986 	if (skb->len < (iph->ihl << 2) + 8) {
987 		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
988 		return;
989 	}
990 
991 	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
992 			   th->source, tcp_v4_iif(skb));
993 	if (!sk) {
994 		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
995 		return;
996 	}
997 	if (sk->sk_state == TCP_TIME_WAIT) {
998 		tcp_tw_put((struct tcp_tw_bucket *)sk);
999 		return;
1000 	}
1001 
1002 	bh_lock_sock(sk);
1003 	/* If too many ICMPs get dropped on busy
1004 	 * servers this needs to be solved differently.
1005 	 */
1006 	if (sock_owned_by_user(sk))
1007 		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1008 
1009 	if (sk->sk_state == TCP_CLOSE)
1010 		goto out;
1011 
1012 	tp = tcp_sk(sk);
1013 	seq = ntohl(th->seq);
1014 	if (sk->sk_state != TCP_LISTEN &&
1015 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
1016 		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1017 		goto out;
1018 	}
1019 
1020 	switch (type) {
1021 	case ICMP_SOURCE_QUENCH:
1022 		/* Just silently ignore these. */
1023 		goto out;
1024 	case ICMP_PARAMETERPROB:
1025 		err = EPROTO;
1026 		break;
1027 	case ICMP_DEST_UNREACH:
1028 		if (code > NR_ICMP_UNREACH)
1029 			goto out;
1030 
1031 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1032 			if (!sock_owned_by_user(sk))
1033 				do_pmtu_discovery(sk, iph, info);
1034 			goto out;
1035 		}
1036 
1037 		err = icmp_err_convert[code].errno;
1038 		break;
1039 	case ICMP_TIME_EXCEEDED:
1040 		err = EHOSTUNREACH;
1041 		break;
1042 	default:
1043 		goto out;
1044 	}
1045 
1046 	switch (sk->sk_state) {
1047 		struct request_sock *req, **prev;
1048 	case TCP_LISTEN:
1049 		if (sock_owned_by_user(sk))
1050 			goto out;
1051 
1052 		req = tcp_v4_search_req(tp, &prev, th->dest,
1053 					iph->daddr, iph->saddr);
1054 		if (!req)
1055 			goto out;
1056 
1057 		/* ICMPs are not backlogged, hence we cannot get
1058 		   an established socket here.
1059 		 */
1060 		BUG_TRAP(!req->sk);
1061 
1062 		if (seq != tcp_rsk(req)->snt_isn) {
1063 			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1064 			goto out;
1065 		}
1066 
1067 		/*
1068 		 * Still in SYN_RECV, just remove it silently.
1069 		 * There is no good way to pass the error to the newly
1070 		 * created socket, and POSIX does not want network
1071 		 * errors returned from accept().
1072 		 */
1073 		tcp_synq_drop(sk, req, prev);
1074 		goto out;
1075 
1076 	case TCP_SYN_SENT:
1077 	case TCP_SYN_RECV:  /* Cannot happen normally;
1078 			       it can, e.g., if SYNs crossed.
1079 			     */
1080 		if (!sock_owned_by_user(sk)) {
1081 			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1082 			sk->sk_err = err;
1083 
1084 			sk->sk_error_report(sk);
1085 
1086 			tcp_done(sk);
1087 		} else {
1088 			sk->sk_err_soft = err;
1089 		}
1090 		goto out;
1091 	}
1092 
1093 	/* If we've already connected we will keep trying
1094 	 * until we time out, or the user gives up.
1095 	 *
1096 	 * RFC 1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
1097 	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
1098 	 * but it is obsoleted by pmtu discovery).
1099 	 *
1100 	 * Note that in the modern internet, where routing is unreliable
1101 	 * and broken firewalls sit in every dark corner, sending random
1102 	 * errors ordered by their masters, even these two messages finally lose
1103 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
1104 	 *
1105 	 * Now we are in compliance with RFCs.
1106 	 *							--ANK (980905)
1107 	 */
1108 
1109 	inet = inet_sk(sk);
1110 	if (!sock_owned_by_user(sk) && inet->recverr) {
1111 		sk->sk_err = err;
1112 		sk->sk_error_report(sk);
1113 	} else	{ /* Only an error on timeout */
1114 		sk->sk_err_soft = err;
1115 	}
1116 
1117 out:
1118 	bh_unlock_sock(sk);
1119 	sock_put(sk);
1120 }
1121 
1122 /* This routine computes an IPv4 TCP checksum. */
1123 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1124 		       struct sk_buff *skb)
1125 {
1126 	struct inet_sock *inet = inet_sk(sk);
1127 
1128 	if (skb->ip_summed == CHECKSUM_HW) {
1129 		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1130 		skb->csum = offsetof(struct tcphdr, check);
1131 	} else {
1132 		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1133 					 csum_partial((char *)th,
1134 						      th->doff << 2,
1135 						      skb->csum));
1136 	}
1137 }
1138 
1139 /*
1140  *	This routine will send an RST to the other tcp.
1141  *
1142  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1143  *		      for the reset?
1144  *	Answer: if a packet caused an RST, it is not for a socket
1145  *		existing in our system; if it is matched to a socket,
1146  *		it is just a duplicate segment or a bug in the other side's TCP.
1147  *		So we build the reply based only on the parameters
1148  *		that arrived with the segment.
1149  *	Exception: precedence violation. We do not implement it in any case.
1150  */
1151 
1152 static void tcp_v4_send_reset(struct sk_buff *skb)
1153 {
1154 	struct tcphdr *th = skb->h.th;
1155 	struct tcphdr rth;
1156 	struct ip_reply_arg arg;
1157 
1158 	/* Never send a reset in response to a reset. */
1159 	if (th->rst)
1160 		return;
1161 
1162 	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1163 		return;
1164 
1165 	/* Swap the send and the receive. */
1166 	memset(&rth, 0, sizeof(struct tcphdr));
1167 	rth.dest   = th->source;
1168 	rth.source = th->dest;
1169 	rth.doff   = sizeof(struct tcphdr) / 4;
1170 	rth.rst    = 1;
1171 
1172 	if (th->ack) {
1173 		rth.seq = th->ack_seq;
1174 	} else {
1175 		rth.ack = 1;
1176 		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1177 				    skb->len - (th->doff << 2));
1178 	}
1179 
1180 	memset(&arg, 0, sizeof arg);
1181 	arg.iov[0].iov_base = (unsigned char *)&rth;
1182 	arg.iov[0].iov_len  = sizeof rth;
1183 	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1184 				      skb->nh.iph->saddr, /*XXX*/
1185 				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
1186 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1187 
1188 	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1189 
1190 	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1191 	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1192 }
1193 
1194 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1195    outside socket context, is certainly ugly. What can I do?
1196  */
1197 
1198 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1199 			    u32 win, u32 ts)
1200 {
1201 	struct tcphdr *th = skb->h.th;
1202 	struct {
1203 		struct tcphdr th;
1204 		u32 tsopt[3];
1205 	} rep;
1206 	struct ip_reply_arg arg;
1207 
1208 	memset(&rep.th, 0, sizeof(struct tcphdr));
1209 	memset(&arg, 0, sizeof arg);
1210 
1211 	arg.iov[0].iov_base = (unsigned char *)&rep;
1212 	arg.iov[0].iov_len  = sizeof(rep.th);
1213 	if (ts) {
1214 		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1215 				     (TCPOPT_TIMESTAMP << 8) |
1216 				     TCPOLEN_TIMESTAMP);
1217 		rep.tsopt[1] = htonl(tcp_time_stamp);
1218 		rep.tsopt[2] = htonl(ts);
1219 		arg.iov[0].iov_len = sizeof(rep);
1220 	}
1221 
1222 	/* Swap the send and the receive. */
1223 	rep.th.dest    = th->source;
1224 	rep.th.source  = th->dest;
1225 	rep.th.doff    = arg.iov[0].iov_len / 4;
1226 	rep.th.seq     = htonl(seq);
1227 	rep.th.ack_seq = htonl(ack);
1228 	rep.th.ack     = 1;
1229 	rep.th.window  = htons(win);
1230 
1231 	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1232 				      skb->nh.iph->saddr, /*XXX*/
1233 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1234 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1235 
1236 	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1237 
1238 	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1239 }
1240 
1241 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1242 {
1243 	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1244 
1245 	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1246 			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1247 
1248 	tcp_tw_put(tw);
1249 }
1250 
1251 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1252 {
1253 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1254 			req->ts_recent);
1255 }
1256 
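/* Build a route for replying to a connection request, honouring a
 * source route option saved from the SYN if one is present.
 */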
1257 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1258 					  struct request_sock *req)
1259 {
1260 	struct rtable *rt;
1261 	const struct inet_request_sock *ireq = inet_rsk(req);
1262 	struct ip_options *opt = inet_rsk(req)->opt;
1263 	struct flowi fl = { .oif = sk->sk_bound_dev_if,
1264 			    .nl_u = { .ip4_u =
1265 				      { .daddr = ((opt && opt->srr) ?
1266 						  opt->faddr :
1267 						  ireq->rmt_addr),
1268 					.saddr = ireq->loc_addr,
1269 					.tos = RT_CONN_FLAGS(sk) } },
1270 			    .proto = IPPROTO_TCP,
1271 			    .uli_u = { .ports =
1272 				       { .sport = inet_sk(sk)->sport,
1273 					 .dport = ireq->rmt_port } } };
1274 
1275 	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1276 		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1277 		return NULL;
1278 	}
1279 	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1280 		ip_rt_put(rt);
1281 		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1282 		return NULL;
1283 	}
1284 	return &rt->u.dst;
1285 }
1286 
1287 /*
1288  *	Send a SYN-ACK after having received an ACK.
1289  *	This still operates on a request_sock only, not on a big
1290  *	socket.
1291  */
1292 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1293 			      struct dst_entry *dst)
1294 {
1295 	const struct inet_request_sock *ireq = inet_rsk(req);
1296 	int err = -1;
1297 	struct sk_buff * skb;
1298 
1299 	/* First, grab a route. */
1300 	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1301 		goto out;
1302 
1303 	skb = tcp_make_synack(sk, dst, req);
1304 
1305 	if (skb) {
1306 		struct tcphdr *th = skb->h.th;
1307 
1308 		th->check = tcp_v4_check(th, skb->len,
1309 					 ireq->loc_addr,
1310 					 ireq->rmt_addr,
1311 					 csum_partial((char *)th, skb->len,
1312 						      skb->csum));
1313 
1314 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1315 					    ireq->rmt_addr,
1316 					    ireq->opt);
1317 		if (err == NET_XMIT_CN)
1318 			err = 0;
1319 	}
1320 
1321 out:
1322 	dst_release(dst);
1323 	return err;
1324 }
1325 
1326 /*
1327  *	IPv4 request_sock destructor.
1328  */
1329 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1330 {
1331 	if (inet_rsk(req)->opt)
1332 		kfree(inet_rsk(req)->opt);
1333 }
1334 
1335 static inline void syn_flood_warning(struct sk_buff *skb)
1336 {
1337 	static unsigned long warntime;
1338 
1339 	if (time_after(jiffies, (warntime + HZ * 60))) {
1340 		warntime = jiffies;
1341 		printk(KERN_INFO
1342 		       "possible SYN flooding on port %d. Sending cookies.\n",
1343 		       ntohs(skb->h.th->dest));
1344 	}
1345 }
1346 
1347 /*
1348  * Save and compile IPv4 options into the request_sock if needed.
1349  */
1350 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1351 						     struct sk_buff *skb)
1352 {
1353 	struct ip_options *opt = &(IPCB(skb)->opt);
1354 	struct ip_options *dopt = NULL;
1355 
1356 	if (opt && opt->optlen) {
1357 		int opt_size = optlength(opt);
1358 		dopt = kmalloc(opt_size, GFP_ATOMIC);
1359 		if (dopt) {
1360 			if (ip_options_echo(dopt, skb)) {
1361 				kfree(dopt);
1362 				dopt = NULL;
1363 			}
1364 		}
1365 	}
1366 	return dopt;
1367 }
1368 
1369 struct request_sock_ops tcp_request_sock_ops = {
1370 	.family		=	PF_INET,
1371 	.obj_size	=	sizeof(struct tcp_request_sock),
1372 	.rtx_syn_ack	=	tcp_v4_send_synack,
1373 	.send_ack	=	tcp_v4_reqsk_send_ack,
1374 	.destructor	=	tcp_v4_reqsk_destructor,
1375 	.send_reset	=	tcp_v4_send_reset,
1376 };
1377 
1378 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1379 {
1380 	struct inet_request_sock *ireq;
1381 	struct tcp_options_received tmp_opt;
1382 	struct request_sock *req;
1383 	__u32 saddr = skb->nh.iph->saddr;
1384 	__u32 daddr = skb->nh.iph->daddr;
1385 	__u32 isn = TCP_SKB_CB(skb)->when;
1386 	struct dst_entry *dst = NULL;
1387 #ifdef CONFIG_SYN_COOKIES
1388 	int want_cookie = 0;
1389 #else
1390 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1391 #endif
1392 
1393 	/* Never answer SYNs sent to broadcast or multicast */
1394 	if (((struct rtable *)skb->dst)->rt_flags &
1395 	    (RTCF_BROADCAST | RTCF_MULTICAST))
1396 		goto drop;
1397 
1398 	/* TW buckets are converted to open requests without
1399 	 * limitation; they conserve resources and the peer is
1400 	 * evidently a real one.
1401 	 */
1402 	if (tcp_synq_is_full(sk) && !isn) {
1403 #ifdef CONFIG_SYN_COOKIES
1404 		if (sysctl_tcp_syncookies) {
1405 			want_cookie = 1;
1406 		} else
1407 #endif
1408 		goto drop;
1409 	}
1410 
1411 	/* The accept backlog is full. If we have already queued enough
1412 	 * warm entries in the syn queue, drop the request. That is better than
1413 	 * clogging the syn queue with openreqs with exponentially increasing
1414 	 * timeout.
1415 	 */
1416 	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1417 		goto drop;
1418 
1419 	req = reqsk_alloc(&tcp_request_sock_ops);
1420 	if (!req)
1421 		goto drop;
1422 
1423 	tcp_clear_options(&tmp_opt);
1424 	tmp_opt.mss_clamp = 536;
1425 	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1426 
1427 	tcp_parse_options(skb, &tmp_opt, 0);
1428 
1429 	if (want_cookie) {
1430 		tcp_clear_options(&tmp_opt);
1431 		tmp_opt.saw_tstamp = 0;
1432 	}
1433 
1434 	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1435 		/* Some OSes (unknown ones, but I see them on a web server which
1436 		 * contains information interesting only for Windows
1437 		 * users) do not send their timestamp in the SYN. It is an easy case:
1438 		 * we simply do not advertise TS support.
1439 		 */
1440 		tmp_opt.saw_tstamp = 0;
1441 		tmp_opt.tstamp_ok  = 0;
1442 	}
1443 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1444 
1445 	tcp_openreq_init(req, &tmp_opt, skb);
1446 
1447 	ireq = inet_rsk(req);
1448 	ireq->loc_addr = daddr;
1449 	ireq->rmt_addr = saddr;
1450 	ireq->opt = tcp_v4_save_options(sk, skb);
1451 	if (!want_cookie)
1452 		TCP_ECN_create_request(req, skb->h.th);
1453 
1454 	if (want_cookie) {
1455 #ifdef CONFIG_SYN_COOKIES
1456 		syn_flood_warning(skb);
1457 #endif
1458 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1459 	} else if (!isn) {
1460 		struct inet_peer *peer = NULL;
1461 
1462 		/* VJ's idea. We save the last timestamp seen
1463 		 * from the destination in the peer table when entering
1464 		 * TIME-WAIT state, and check against it before
1465 		 * accepting a new connection request.
1466 		 *
1467 		 * If "isn" is not zero, this request hit a live
1468 		 * timewait bucket, so all the necessary checks
1469 		 * are made in the function processing the timewait state.
1470 		 */
1471 		if (tmp_opt.saw_tstamp &&
1472 		    sysctl_tcp_tw_recycle &&
1473 		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
1474 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1475 		    peer->v4daddr == saddr) {
1476 			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1477 			    (s32)(peer->tcp_ts - req->ts_recent) >
1478 							TCP_PAWS_WINDOW) {
1479 				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1480 				dst_release(dst);
1481 				goto drop_and_free;
1482 			}
1483 		}
1484 		/* Kill the following clause if you dislike this approach. */
1485 		else if (!sysctl_tcp_syncookies &&
1486 			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1487 			  (sysctl_max_syn_backlog >> 2)) &&
1488 			 (!peer || !peer->tcp_ts_stamp) &&
1489 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1490 			/* Without syncookies the last quarter of the
1491 			 * backlog is filled only with destinations
1492 			 * proven to be alive.
1493 			 * It means that we continue to communicate with
1494 			 * destinations already remembered by
1495 			 * the moment of the synflood.
1496 			 */
1497 			NETDEBUG(if (net_ratelimit()) \
1498 					printk(KERN_DEBUG "TCP: drop open "
1499 							  "request from %u.%u."
1500 							  "%u.%u/%u\n", \
1501 					       NIPQUAD(saddr),
1502 					       ntohs(skb->h.th->source)));
1503 			dst_release(dst);
1504 			goto drop_and_free;
1505 		}
1506 
1507 		isn = tcp_v4_init_sequence(sk, skb);
1508 	}
1509 	tcp_rsk(req)->snt_isn = isn;
1510 
1511 	if (tcp_v4_send_synack(sk, req, dst))
1512 		goto drop_and_free;
1513 
1514 	if (want_cookie) {
1515 	   	reqsk_free(req);
1516 	} else {
1517 		tcp_v4_synq_add(sk, req);
1518 	}
1519 	return 0;
1520 
1521 drop_and_free:
1522 	reqsk_free(req);
1523 drop:
1524 	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1525 	return 0;
1526 }
1527 
1528 
1529 /*
1530  * The three way handshake has completed - we got a valid synack -
1531  * now create the new socket.
1532  */
1533 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1534 				  struct request_sock *req,
1535 				  struct dst_entry *dst)
1536 {
1537 	struct inet_request_sock *ireq;
1538 	struct inet_sock *newinet;
1539 	struct tcp_sock *newtp;
1540 	struct sock *newsk;
1541 
1542 	if (sk_acceptq_is_full(sk))
1543 		goto exit_overflow;
1544 
1545 	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1546 		goto exit;
1547 
1548 	newsk = tcp_create_openreq_child(sk, req, skb);
1549 	if (!newsk)
1550 		goto exit;
1551 
1552 	newsk->sk_dst_cache = dst;
1553 	tcp_v4_setup_caps(newsk, dst);
1554 
1555 	newtp		      = tcp_sk(newsk);
1556 	newinet		      = inet_sk(newsk);
1557 	ireq		      = inet_rsk(req);
1558 	newinet->daddr	      = ireq->rmt_addr;
1559 	newinet->rcv_saddr    = ireq->loc_addr;
1560 	newinet->saddr	      = ireq->loc_addr;
1561 	newinet->opt	      = ireq->opt;
1562 	ireq->opt	      = NULL;
1563 	newinet->mc_index     = tcp_v4_iif(skb);
1564 	newinet->mc_ttl	      = skb->nh.iph->ttl;
1565 	newtp->ext_header_len = 0;
1566 	if (newinet->opt)
1567 		newtp->ext_header_len = newinet->opt->optlen;
1568 	newinet->id = newtp->write_seq ^ jiffies;
1569 
1570 	tcp_sync_mss(newsk, dst_mtu(dst));
1571 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1572 	tcp_initialize_rcv_mss(newsk);
1573 
1574 	__tcp_v4_hash(newsk, 0);
1575 	__tcp_inherit_port(sk, newsk);
1576 
1577 	return newsk;
1578 
1579 exit_overflow:
1580 	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1581 exit:
1582 	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1583 	dst_release(dst);
1584 	return NULL;
1585 }
1586 
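/* For a segment arriving on a listening socket: match it against the
 * pending request queue or an already established child, falling back
 * to syncookie validation for bare ACKs when syncookies are compiled in.
 */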
1587 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1588 {
1589 	struct tcphdr *th = skb->h.th;
1590 	struct iphdr *iph = skb->nh.iph;
1591 	struct tcp_sock *tp = tcp_sk(sk);
1592 	struct sock *nsk;
1593 	struct request_sock **prev;
1594 	/* Find possible connection requests. */
1595 	struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1596 						     iph->saddr, iph->daddr);
1597 	if (req)
1598 		return tcp_check_req(sk, skb, req, prev);
1599 
1600 	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1601 					  th->source,
1602 					  skb->nh.iph->daddr,
1603 					  ntohs(th->dest),
1604 					  tcp_v4_iif(skb));
1605 
1606 	if (nsk) {
1607 		if (nsk->sk_state != TCP_TIME_WAIT) {
1608 			bh_lock_sock(nsk);
1609 			return nsk;
1610 		}
1611 		tcp_tw_put((struct tcp_tw_bucket *)nsk);
1612 		return NULL;
1613 	}
1614 
1615 #ifdef CONFIG_SYN_COOKIES
1616 	if (!th->rst && !th->syn && th->ack)
1617 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1618 #endif
1619 	return sk;
1620 }
1621 
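/* Validate or prepare the TCP checksum of an incoming segment: trust
 * and verify hardware checksums, fully check short packets here, and
 * leave longer ones with a pseudo-header seed for later verification.
 */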
1622 static int tcp_v4_checksum_init(struct sk_buff *skb)
1623 {
1624 	if (skb->ip_summed == CHECKSUM_HW) {
1625 		skb->ip_summed = CHECKSUM_UNNECESSARY;
1626 		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1627 				  skb->nh.iph->daddr, skb->csum))
1628 			return 0;
1629 
1630 		NETDEBUG(if (net_ratelimit())
1631 				printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1632 		skb->ip_summed = CHECKSUM_NONE;
1633 	}
1634 	if (skb->len <= 76) {
1635 		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1636 				 skb->nh.iph->daddr,
1637 				 skb_checksum(skb, 0, skb->len, 0)))
1638 			return -1;
1639 		skb->ip_summed = CHECKSUM_UNNECESSARY;
1640 	} else {
1641 		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1642 					  skb->nh.iph->saddr,
1643 					  skb->nh.iph->daddr, 0);
1644 	}
1645 	return 0;
1646 }
1647 
1648 
1649 /* The socket must have it's spinlock held when we get
1650  * here.
1651  *
1652  * We have a potential double-lock case here, so even when
1653  * doing backlog processing we use the BH locking scheme.
1654  * This is because we cannot sleep with the original spinlock
1655  * held.
1656  */
1657 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1658 {
1659 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1660 		TCP_CHECK_TIMER(sk);
1661 		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1662 			goto reset;
1663 		TCP_CHECK_TIMER(sk);
1664 		return 0;
1665 	}
1666 
1667 	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1668 		goto csum_err;
1669 
1670 	if (sk->sk_state == TCP_LISTEN) {
1671 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1672 		if (!nsk)
1673 			goto discard;
1674 
1675 		if (nsk != sk) {
1676 			if (tcp_child_process(sk, nsk, skb))
1677 				goto reset;
1678 			return 0;
1679 		}
1680 	}
1681 
1682 	TCP_CHECK_TIMER(sk);
1683 	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1684 		goto reset;
1685 	TCP_CHECK_TIMER(sk);
1686 	return 0;
1687 
1688 reset:
1689 	tcp_v4_send_reset(skb);
1690 discard:
1691 	kfree_skb(skb);
1692 	/* Be careful here. If this function gets more complicated and
1693 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1694 	 * might be destroyed here. This current version compiles correctly,
1695 	 * but you have been warned.
1696 	 */
1697 	return 0;
1698 
1699 csum_err:
1700 	TCP_INC_STATS_BH(TCP_MIB_INERRS);
1701 	goto discard;
1702 }
1703 
1704 /*
1705  *	From tcp_input.c
1706  */
1707 
1708 int tcp_v4_rcv(struct sk_buff *skb)
1709 {
1710 	struct tcphdr *th;
1711 	struct sock *sk;
1712 	int ret;
1713 
1714 	if (skb->pkt_type != PACKET_HOST)
1715 		goto discard_it;
1716 
1717 	/* Count it even if it's bad */
1718 	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1719 
1720 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1721 		goto discard_it;
1722 
1723 	th = skb->h.th;
1724 
1725 	if (th->doff < sizeof(struct tcphdr) / 4)
1726 		goto bad_packet;
1727 	if (!pskb_may_pull(skb, th->doff * 4))
1728 		goto discard_it;
1729 
1730 	/* An explanation is required here, I think.
1731 	 * Packet length and doff are validated by header prediction,
1732 	 * provided the case of th->doff == 0 is eliminated.
1733 	 * So, we defer the checks. */
1734 	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1735 	     tcp_v4_checksum_init(skb) < 0))
1736 		goto bad_packet;
1737 
1738 	th = skb->h.th;
1739 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1740 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1741 				    skb->len - th->doff * 4);
1742 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1743 	TCP_SKB_CB(skb)->when	 = 0;
1744 	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
1745 	TCP_SKB_CB(skb)->sacked	 = 0;
1746 
1747 	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1748 			     skb->nh.iph->daddr, ntohs(th->dest),
1749 			     tcp_v4_iif(skb));
1750 
1751 	if (!sk)
1752 		goto no_tcp_socket;
1753 
1754 process:
1755 	if (sk->sk_state == TCP_TIME_WAIT)
1756 		goto do_time_wait;
1757 
1758 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1759 		goto discard_and_relse;
1760 
1761 	if (sk_filter(sk, skb, 0))
1762 		goto discard_and_relse;
1763 
1764 	skb->dev = NULL;
1765 
1766 	bh_lock_sock(sk);
1767 	ret = 0;
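	/* If the socket is locked by user context, queue the segment on
	 * the backlog; otherwise try the prequeue and fall back to the
	 * full receive path. */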
1768 	if (!sock_owned_by_user(sk)) {
1769 		if (!tcp_prequeue(sk, skb))
1770 			ret = tcp_v4_do_rcv(sk, skb);
1771 	} else
1772 		sk_add_backlog(sk, skb);
1773 	bh_unlock_sock(sk);
1774 
1775 	sock_put(sk);
1776 
1777 	return ret;
1778 
1779 no_tcp_socket:
1780 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1781 		goto discard_it;
1782 
1783 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1784 bad_packet:
1785 		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1786 	} else {
1787 		tcp_v4_send_reset(skb);
1788 	}
1789 
1790 discard_it:
1791 	/* Discard frame. */
1792 	kfree_skb(skb);
1793   	return 0;
1794 
1795 discard_and_relse:
1796 	sock_put(sk);
1797 	goto discard_it;
1798 
1799 do_time_wait:
1800 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1801 		tcp_tw_put((struct tcp_tw_bucket *) sk);
1802 		goto discard_it;
1803 	}
1804 
1805 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1806 		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1807 		tcp_tw_put((struct tcp_tw_bucket *) sk);
1808 		goto discard_it;
1809 	}
1810 	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1811 					   skb, th, skb->len)) {
1812 	case TCP_TW_SYN: {
1813 		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1814 							  ntohs(th->dest),
1815 							  tcp_v4_iif(skb));
1816 		if (sk2) {
1817 			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1818 			tcp_tw_put((struct tcp_tw_bucket *)sk);
1819 			sk = sk2;
1820 			goto process;
1821 		}
1822 		/* Fall through to ACK */
1823 	}
1824 	case TCP_TW_ACK:
1825 		tcp_v4_timewait_ack(sk, skb);
1826 		break;
1827 	case TCP_TW_RST:
1828 		goto no_tcp_socket;
1829 	case TCP_TW_SUCCESS:;
1830 	}
1831 	goto discard_it;
1832 }
1833 
1834 /* With per-bucket locks this operation is not atomic, so
1835  * this version is no worse.
1836  */
1837 static void __tcp_v4_rehash(struct sock *sk)
1838 {
1839 	sk->sk_prot->unhash(sk);
1840 	sk->sk_prot->hash(sk);
1841 }
1842 
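/* Re-query the route and, if the preferred source address has changed,
 * adopt it and rehash the socket.  Only reached from
 * tcp_v4_rebuild_header() when ip_dynaddr is set and the connection is
 * still in SYN-SENT.
 */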
1843 static int tcp_v4_reselect_saddr(struct sock *sk)
1844 {
1845 	struct inet_sock *inet = inet_sk(sk);
1846 	int err;
1847 	struct rtable *rt;
1848 	__u32 old_saddr = inet->saddr;
1849 	__u32 new_saddr;
1850 	__u32 daddr = inet->daddr;
1851 
1852 	if (inet->opt && inet->opt->srr)
1853 		daddr = inet->opt->faddr;
1854 
1855 	/* Query new route. */
1856 	err = ip_route_connect(&rt, daddr, 0,
1857 			       RT_CONN_FLAGS(sk),
1858 			       sk->sk_bound_dev_if,
1859 			       IPPROTO_TCP,
1860 			       inet->sport, inet->dport, sk);
1861 	if (err)
1862 		return err;
1863 
1864 	__sk_dst_set(sk, &rt->u.dst);
1865 	tcp_v4_setup_caps(sk, &rt->u.dst);
1866 
1867 	new_saddr = rt->rt_src;
1868 
1869 	if (new_saddr == old_saddr)
1870 		return 0;
1871 
1872 	if (sysctl_ip_dynaddr > 1) {
1873 		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1874 				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1875 		       NIPQUAD(old_saddr),
1876 		       NIPQUAD(new_saddr));
1877 	}
1878 
1879 	inet->saddr = new_saddr;
1880 	inet->rcv_saddr = new_saddr;
1881 
1882 	/* XXX The only ugly spot where we need to
1883 	 * XXX really change the socket's identity after
1884 	 * XXX it has entered the hashes. -DaveM
1885 	 *
1886 	 * Besides that, it does not check for connection
1887 	 * uniqueness. Wait for troubles.
1888 	 */
1889 	__tcp_v4_rehash(sk);
1890 	return 0;
1891 }
1892 
1893 int tcp_v4_rebuild_header(struct sock *sk)
1894 {
1895 	struct inet_sock *inet = inet_sk(sk);
1896 	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1897 	u32 daddr;
1898 	int err;
1899 
1900 	/* Route is OK, nothing to do. */
1901 	if (rt)
1902 		return 0;
1903 
1904 	/* Reroute. */
1905 	daddr = inet->daddr;
1906 	if (inet->opt && inet->opt->srr)
1907 		daddr = inet->opt->faddr;
1908 
1909 	{
1910 		struct flowi fl = { .oif = sk->sk_bound_dev_if,
1911 				    .nl_u = { .ip4_u =
1912 					      { .daddr = daddr,
1913 						.saddr = inet->saddr,
1914 						.tos = RT_CONN_FLAGS(sk) } },
1915 				    .proto = IPPROTO_TCP,
1916 				    .uli_u = { .ports =
1917 					       { .sport = inet->sport,
1918 						 .dport = inet->dport } } };
1919 
1920 		err = ip_route_output_flow(&rt, &fl, sk, 0);
1921 	}
1922 	if (!err) {
1923 		__sk_dst_set(sk, &rt->u.dst);
1924 		tcp_v4_setup_caps(sk, &rt->u.dst);
1925 		return 0;
1926 	}
1927 
1928 	/* Routing failed... */
1929 	sk->sk_route_caps = 0;
1930 
1931 	if (!sysctl_ip_dynaddr ||
1932 	    sk->sk_state != TCP_SYN_SENT ||
1933 	    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1934 	    (err = tcp_v4_reselect_saddr(sk)) != 0)
1935 		sk->sk_err_soft = -err;
1936 
1937 	return err;
1938 }
1939 
1940 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1941 {
1942 	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1943 	struct inet_sock *inet = inet_sk(sk);
1944 
1945 	sin->sin_family		= AF_INET;
1946 	sin->sin_addr.s_addr	= inet->daddr;
1947 	sin->sin_port		= inet->dport;
1948 }
1949 
1950 /* VJ's idea. Save last timestamp seen from this destination
1951  * and hold it at least for normal timewait interval to use for duplicate
1952  * segment detection in subsequent connections, before they enter synchronized
1953  * state.
1954  */
1955 
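/*
 * Note on the comparisons below: (s32)(peer->tcp_ts - ts_recent) <= 0 is
 * a wraparound-safe way of asking "is the peer's cached timestamp not
 * newer than the one we just saw?", in the same spirit as the before()/
 * after() helpers used on sequence numbers.  The same pattern is used
 * for TIME_WAIT buckets in tcp_v4_tw_remember_stamp() further down.
 */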
1956 int tcp_v4_remember_stamp(struct sock *sk)
1957 {
1958 	struct inet_sock *inet = inet_sk(sk);
1959 	struct tcp_sock *tp = tcp_sk(sk);
1960 	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1961 	struct inet_peer *peer = NULL;
1962 	int release_it = 0;
1963 
1964 	if (!rt || rt->rt_dst != inet->daddr) {
1965 		peer = inet_getpeer(inet->daddr, 1);
1966 		release_it = 1;
1967 	} else {
1968 		if (!rt->peer)
1969 			rt_bind_peer(rt, 1);
1970 		peer = rt->peer;
1971 	}
1972 
1973 	if (peer) {
1974 		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1975 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1976 		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1977 			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1978 			peer->tcp_ts = tp->rx_opt.ts_recent;
1979 		}
1980 		if (release_it)
1981 			inet_putpeer(peer);
1982 		return 1;
1983 	}
1984 
1985 	return 0;
1986 }
1987 
1988 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1989 {
1990 	struct inet_peer *peer = NULL;
1991 
1992 	peer = inet_getpeer(tw->tw_daddr, 1);
1993 
1994 	if (peer) {
1995 		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1996 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1997 		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1998 			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1999 			peer->tcp_ts = tw->tw_ts_recent;
2000 		}
2001 		inet_putpeer(peer);
2002 		return 1;
2003 	}
2004 
2005 	return 0;
2006 }
2007 
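/*
 * AF_INET operations vector handed to the protocol-independent TCP code
 * through tp->af_specific (set up in tcp_v4_init_sock() below).
 */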
2008 struct tcp_func ipv4_specific = {
2009 	.queue_xmit	=	ip_queue_xmit,
2010 	.send_check	=	tcp_v4_send_check,
2011 	.rebuild_header	=	tcp_v4_rebuild_header,
2012 	.conn_request	=	tcp_v4_conn_request,
2013 	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
2014 	.remember_stamp	=	tcp_v4_remember_stamp,
2015 	.net_header_len	=	sizeof(struct iphdr),
2016 	.setsockopt	=	ip_setsockopt,
2017 	.getsockopt	=	ip_getsockopt,
2018 	.addr2sockaddr	=	v4_addr2sockaddr,
2019 	.sockaddr_len	=	sizeof(struct sockaddr_in),
2020 };
2021 
2022 /* NOTE: A lot of fields are already zeroed by the call to
2023  *       sk_alloc(), so they need not be initialized here.
2024  */
2025 static int tcp_v4_init_sock(struct sock *sk)
2026 {
2027 	struct tcp_sock *tp = tcp_sk(sk);
2028 
2029 	skb_queue_head_init(&tp->out_of_order_queue);
2030 	tcp_init_xmit_timers(sk);
2031 	tcp_prequeue_init(tp);
2032 
2033 	tp->rto  = TCP_TIMEOUT_INIT;
2034 	tp->mdev = TCP_TIMEOUT_INIT;
2035 
2036 	/* So many TCP implementations out there (incorrectly) count the
2037 	 * initial SYN frame in their delayed-ACK and congestion control
2038 	 * algorithms that we must have the following bandaid to talk
2039 	 * efficiently to them.  -DaveM
2040 	 */
2041 	tp->snd_cwnd = 2;
2042 
2043 	/* See draft-stevens-tcpca-spec-01 for discussion of the
2044 	 * initialization of these values.
2045 	 */
2046 	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
2047 	tp->snd_cwnd_clamp = ~0;
2048 	tp->mss_cache = 536;	/* default MSS when none has been negotiated (RFC 1122) */
2049 
2050 	tp->reordering = sysctl_tcp_reordering;
2051 	tp->ca_ops = &tcp_init_congestion_ops;
2052 
2053 	sk->sk_state = TCP_CLOSE;
2054 
2055 	sk->sk_write_space = sk_stream_write_space;
2056 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2057 
2058 	tp->af_specific = &ipv4_specific;
2059 
2060 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
2061 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2062 
2063 	atomic_inc(&tcp_sockets_allocated);
2064 
2065 	return 0;
2066 }
2067 
2068 int tcp_v4_destroy_sock(struct sock *sk)
2069 {
2070 	struct tcp_sock *tp = tcp_sk(sk);
2071 
2072 	tcp_clear_xmit_timers(sk);
2073 
2074 	tcp_cleanup_congestion_control(tp);
2075 
2076 	/* Clean up the write buffer. */
2077 	sk_stream_writequeue_purge(sk);
2078 
2079 	/* Clean up our, hopefully empty, out_of_order_queue. */
2080 	__skb_queue_purge(&tp->out_of_order_queue);
2081 
2082 	/* Clean up the prequeue; it should already be empty. */
2083 	__skb_queue_purge(&tp->ucopy.prequeue);
2084 
2085 	/* Clean up a referenced TCP bind bucket. */
2086 	if (tp->bind_hash)
2087 		tcp_put_port(sk);
2088 
2089 	/*
2090 	 * If a cached sendmsg page exists, free it.
2091 	 */
2092 	if (sk->sk_sndmsg_page) {
2093 		__free_page(sk->sk_sndmsg_page);
2094 		sk->sk_sndmsg_page = NULL;
2095 	}
2096 
2097 	atomic_dec(&tcp_sockets_allocated);
2098 
2099 	return 0;
2100 }
2101 
2102 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2103 
2104 #ifdef CONFIG_PROC_FS
2105 /* Proc filesystem TCP sock list dumping. */
2106 
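/* Helpers for walking the TIME_WAIT chains in the upper half of the
 * established hash table.
 */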
2107 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2108 {
2109 	return hlist_empty(head) ? NULL :
2110 		list_entry(head->first, struct tcp_tw_bucket, tw_node);
2111 }
2112 
2113 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2114 {
2115 	return tw->tw_node.next ?
2116 		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2117 }
2118 
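/*
 * Advance the iterator over the listening hash.  For every listening
 * socket of the requested family we also walk its SYN queue
 * (TCP_SEQ_STATE_OPENREQ), holding syn_wait_lock while doing so.
 */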
2119 static void *listening_get_next(struct seq_file *seq, void *cur)
2120 {
2121 	struct tcp_sock *tp;
2122 	struct hlist_node *node;
2123 	struct sock *sk = cur;
2124 	struct tcp_iter_state* st = seq->private;
2125 
2126 	if (!sk) {
2127 		st->bucket = 0;
2128 		sk = sk_head(&tcp_listening_hash[0]);
2129 		goto get_sk;
2130 	}
2131 
2132 	++st->num;
2133 
2134 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
2135 		struct request_sock *req = cur;
2136 
2137 		tp = tcp_sk(st->syn_wait_sk);
2138 		req = req->dl_next;
2139 		while (1) {
2140 			while (req) {
2141 				if (req->rsk_ops->family == st->family) {
2142 					cur = req;
2143 					goto out;
2144 				}
2145 				req = req->dl_next;
2146 			}
2147 			if (++st->sbucket >= TCP_SYNQ_HSIZE)
2148 				break;
2149 get_req:
2150 			req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2151 		}
2152 		sk	  = sk_next(st->syn_wait_sk);
2153 		st->state = TCP_SEQ_STATE_LISTENING;
2154 		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2155 	} else {
2156 		tp = tcp_sk(sk);
2157 		read_lock_bh(&tp->accept_queue.syn_wait_lock);
2158 		if (reqsk_queue_len(&tp->accept_queue))
2159 			goto start_req;
2160 		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2161 		sk = sk_next(sk);
2162 	}
2163 get_sk:
2164 	sk_for_each_from(sk, node) {
2165 		if (sk->sk_family == st->family) {
2166 			cur = sk;
2167 			goto out;
2168 		}
2169 		tp = tcp_sk(sk);
2170 		read_lock_bh(&tp->accept_queue.syn_wait_lock);
2171 		if (reqsk_queue_len(&tp->accept_queue)) {
2172 start_req:
2173 			st->uid		= sock_i_uid(sk);
2174 			st->syn_wait_sk = sk;
2175 			st->state	= TCP_SEQ_STATE_OPENREQ;
2176 			st->sbucket	= 0;
2177 			goto get_req;
2178 		}
2179 		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2180 	}
2181 	if (++st->bucket < TCP_LHTABLE_SIZE) {
2182 		sk = sk_head(&tcp_listening_hash[st->bucket]);
2183 		goto get_sk;
2184 	}
2185 	cur = NULL;
2186 out:
2187 	return cur;
2188 }
2189 
2190 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2191 {
2192 	void *rc = listening_get_next(seq, NULL);
2193 
2194 	while (rc && *pos) {
2195 		rc = listening_get_next(seq, rc);
2196 		--*pos;
2197 	}
2198 	return rc;
2199 }
2200 
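/*
 * Find the first entry of the requested family in the established hash,
 * checking the TIME_WAIT half of each bucket as well.  The per-bucket
 * read lock is left held when an entry is returned; it is released by
 * established_get_next() or tcp_seq_stop().
 */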
2201 static void *established_get_first(struct seq_file *seq)
2202 {
2203 	struct tcp_iter_state* st = seq->private;
2204 	void *rc = NULL;
2205 
2206 	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2207 		struct sock *sk;
2208 		struct hlist_node *node;
2209 		struct tcp_tw_bucket *tw;
2210 
2211 		/* We can reschedule _before_ having picked the target: */
2212 		cond_resched_softirq();
2213 
2214 		read_lock(&tcp_ehash[st->bucket].lock);
2215 		sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2216 			if (sk->sk_family != st->family) {
2217 				continue;
2218 			}
2219 			rc = sk;
2220 			goto out;
2221 		}
2222 		st->state = TCP_SEQ_STATE_TIME_WAIT;
2223 		tw_for_each(tw, node,
2224 			    &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2225 			if (tw->tw_family != st->family) {
2226 				continue;
2227 			}
2228 			rc = tw;
2229 			goto out;
2230 		}
2231 		read_unlock(&tcp_ehash[st->bucket].lock);
2232 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2233 	}
2234 out:
2235 	return rc;
2236 }
2237 
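/*
 * Step to the next matching entry: first the remaining full sockets in
 * the current bucket, then its TIME_WAIT chain, then the following
 * buckets, dropping and re-taking the per-bucket lock as we cross them.
 */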
2238 static void *established_get_next(struct seq_file *seq, void *cur)
2239 {
2240 	struct sock *sk = cur;
2241 	struct tcp_tw_bucket *tw;
2242 	struct hlist_node *node;
2243 	struct tcp_iter_state* st = seq->private;
2244 
2245 	++st->num;
2246 
2247 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2248 		tw = cur;
2249 		tw = tw_next(tw);
2250 get_tw:
2251 		while (tw && tw->tw_family != st->family) {
2252 			tw = tw_next(tw);
2253 		}
2254 		if (tw) {
2255 			cur = tw;
2256 			goto out;
2257 		}
2258 		read_unlock(&tcp_ehash[st->bucket].lock);
2259 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2260 
2261 		/* We can reschedule between buckets: */
2262 		cond_resched_softirq();
2263 
2264 		if (++st->bucket < tcp_ehash_size) {
2265 			read_lock(&tcp_ehash[st->bucket].lock);
2266 			sk = sk_head(&tcp_ehash[st->bucket].chain);
2267 		} else {
2268 			cur = NULL;
2269 			goto out;
2270 		}
2271 	} else
2272 		sk = sk_next(sk);
2273 
2274 	sk_for_each_from(sk, node) {
2275 		if (sk->sk_family == st->family)
2276 			goto found;
2277 	}
2278 
2279 	st->state = TCP_SEQ_STATE_TIME_WAIT;
2280 	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2281 	goto get_tw;
2282 found:
2283 	cur = sk;
2284 out:
2285 	return cur;
2286 }
2287 
2288 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2289 {
2290 	void *rc = established_get_first(seq);
2291 
2292 	while (rc && pos) {
2293 		rc = established_get_next(seq, rc);
2294 		--pos;
2295 	}
2296 	return rc;
2297 }
2298 
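/*
 * Position the iterator at entry number 'pos': listening sockets first,
 * then established/TIME_WAIT ones.  Note the locking hand-off: the
 * listening lock is taken here and only dropped once we move on to the
 * established hash, with bottom halves disabled for the ehash locks.
 */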
2299 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2300 {
2301 	void *rc;
2302 	struct tcp_iter_state* st = seq->private;
2303 
2304 	tcp_listen_lock();
2305 	st->state = TCP_SEQ_STATE_LISTENING;
2306 	rc	  = listening_get_idx(seq, &pos);
2307 
2308 	if (!rc) {
2309 		tcp_listen_unlock();
2310 		local_bh_disable();
2311 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2312 		rc	  = established_get_idx(seq, pos);
2313 	}
2314 
2315 	return rc;
2316 }
2317 
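/*
 * seq_file callbacks.  SEQ_START_TOKEN is returned for position 0 so
 * that tcp4_seq_show() can emit the header line before any sockets.
 */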
2318 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2319 {
2320 	struct tcp_iter_state* st = seq->private;
2321 	st->state = TCP_SEQ_STATE_LISTENING;
2322 	st->num = 0;
2323 	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2324 }
2325 
2326 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2327 {
2328 	void *rc = NULL;
2329 	struct tcp_iter_state* st;
2330 
2331 	if (v == SEQ_START_TOKEN) {
2332 		rc = tcp_get_idx(seq, 0);
2333 		goto out;
2334 	}
2335 	st = seq->private;
2336 
2337 	switch (st->state) {
2338 	case TCP_SEQ_STATE_OPENREQ:
2339 	case TCP_SEQ_STATE_LISTENING:
2340 		rc = listening_get_next(seq, v);
2341 		if (!rc) {
2342 			tcp_listen_unlock();
2343 			local_bh_disable();
2344 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2345 			rc	  = established_get_first(seq);
2346 		}
2347 		break;
2348 	case TCP_SEQ_STATE_ESTABLISHED:
2349 	case TCP_SEQ_STATE_TIME_WAIT:
2350 		rc = established_get_next(seq, v);
2351 		break;
2352 	}
2353 out:
2354 	++*pos;
2355 	return rc;
2356 }
2357 
2358 static void tcp_seq_stop(struct seq_file *seq, void *v)
2359 {
2360 	struct tcp_iter_state* st = seq->private;
2361 
2362 	switch (st->state) {
2363 	case TCP_SEQ_STATE_OPENREQ:
2364 		if (v) {
2365 			struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2366 			read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2367 		}
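		/* fall through: the listening lock must be dropped as well */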
2368 	case TCP_SEQ_STATE_LISTENING:
2369 		if (v != SEQ_START_TOKEN)
2370 			tcp_listen_unlock();
2371 		break;
2372 	case TCP_SEQ_STATE_TIME_WAIT:
2373 	case TCP_SEQ_STATE_ESTABLISHED:
2374 		if (v)
2375 			read_unlock(&tcp_ehash[st->bucket].lock);
2376 		local_bh_enable();
2377 		break;
2378 	}
2379 }
2380 
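/*
 * Allocate the per-open iterator state and wire up the generic seq_file
 * operations; the family-specific show routine comes from the afinfo
 * stored in the proc entry's ->data by tcp_proc_register().
 */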
2381 static int tcp_seq_open(struct inode *inode, struct file *file)
2382 {
2383 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2384 	struct seq_file *seq;
2385 	struct tcp_iter_state *s;
2386 	int rc;
2387 
2388 	if (unlikely(afinfo == NULL))
2389 		return -EINVAL;
2390 
2391 	s = kmalloc(sizeof(*s), GFP_KERNEL);
2392 	if (!s)
2393 		return -ENOMEM;
2394 	memset(s, 0, sizeof(*s));
2395 	s->family		= afinfo->family;
2396 	s->seq_ops.start	= tcp_seq_start;
2397 	s->seq_ops.next		= tcp_seq_next;
2398 	s->seq_ops.show		= afinfo->seq_show;
2399 	s->seq_ops.stop		= tcp_seq_stop;
2400 
2401 	rc = seq_open(file, &s->seq_ops);
2402 	if (rc)
2403 		goto out_kfree;
2404 	seq	     = file->private_data;
2405 	seq->private = s;
2406 out:
2407 	return rc;
2408 out_kfree:
2409 	kfree(s);
2410 	goto out;
2411 }
2412 
2413 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2414 {
2415 	int rc = 0;
2416 	struct proc_dir_entry *p;
2417 
2418 	if (!afinfo)
2419 		return -EINVAL;
2420 	afinfo->seq_fops->owner		= afinfo->owner;
2421 	afinfo->seq_fops->open		= tcp_seq_open;
2422 	afinfo->seq_fops->read		= seq_read;
2423 	afinfo->seq_fops->llseek	= seq_lseek;
2424 	afinfo->seq_fops->release	= seq_release_private;
2425 
2426 	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2427 	if (p)
2428 		p->data = afinfo;
2429 	else
2430 		rc = -ENOMEM;
2431 	return rc;
2432 }
2433 
2434 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2435 {
2436 	if (!afinfo)
2437 		return;
2438 	proc_net_remove(afinfo->name);
2439 	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2440 }
2441 
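/*
 * The three helpers below format one /proc/net/tcp line each for,
 * respectively, an embryonic SYN_RECV request, a full socket and a
 * TIME_WAIT bucket.  The field order follows the header printed by
 * tcp4_seq_show(): slot, local and remote address:port in hex, state,
 * tx/rx queue sizes, timer info, retransmits, uid, timeout, inode,
 * then a few extra debugging values.
 */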
2442 static void get_openreq4(struct sock *sk, struct request_sock *req,
2443 			 char *tmpbuf, int i, int uid)
2444 {
2445 	const struct inet_request_sock *ireq = inet_rsk(req);
2446 	int ttd = req->expires - jiffies;
2447 
2448 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2449 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2450 		i,
2451 		ireq->loc_addr,
2452 		ntohs(inet_sk(sk)->sport),
2453 		ireq->rmt_addr,
2454 		ntohs(ireq->rmt_port),
2455 		TCP_SYN_RECV,
2456 		0, 0, /* could print option size, but that is af dependent. */
2457 		1,    /* timers active (only the expire timer) */
2458 		jiffies_to_clock_t(ttd),
2459 		req->retrans,
2460 		uid,
2461 		0,  /* non-standard timer */
2462 		0, /* open_requests have no inode */
2463 		atomic_read(&sk->sk_refcnt),
2464 		req);
2465 }
2466 
2467 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2468 {
2469 	int timer_active;
2470 	unsigned long timer_expires;
2471 	struct tcp_sock *tp = tcp_sk(sp);
2472 	struct inet_sock *inet = inet_sk(sp);
2473 	unsigned int dest = inet->daddr;
2474 	unsigned int src = inet->rcv_saddr;
2475 	__u16 destp = ntohs(inet->dport);
2476 	__u16 srcp = ntohs(inet->sport);
2477 
2478 	if (tp->pending == TCP_TIME_RETRANS) {
2479 		timer_active	= 1;
2480 		timer_expires	= tp->timeout;
2481 	} else if (tp->pending == TCP_TIME_PROBE0) {
2482 		timer_active	= 4;
2483 		timer_expires	= tp->timeout;
2484 	} else if (timer_pending(&sp->sk_timer)) {
2485 		timer_active	= 2;
2486 		timer_expires	= sp->sk_timer.expires;
2487 	} else {
2488 		timer_active	= 0;
2489 		timer_expires = jiffies;
2490 	}
2491 
2492 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2493 			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
2494 		i, src, srcp, dest, destp, sp->sk_state,
2495 		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2496 		timer_active,
2497 		jiffies_to_clock_t(timer_expires - jiffies),
2498 		tp->retransmits,
2499 		sock_i_uid(sp),
2500 		tp->probes_out,
2501 		sock_i_ino(sp),
2502 		atomic_read(&sp->sk_refcnt), sp,
2503 		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2504 		tp->snd_cwnd,
2505 		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2506 }
2507 
2508 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2509 {
2510 	unsigned int dest, src;
2511 	__u16 destp, srcp;
2512 	int ttd = tw->tw_ttd - jiffies;
2513 
2514 	if (ttd < 0)
2515 		ttd = 0;
2516 
2517 	dest  = tw->tw_daddr;
2518 	src   = tw->tw_rcv_saddr;
2519 	destp = ntohs(tw->tw_dport);
2520 	srcp  = ntohs(tw->tw_sport);
2521 
2522 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2523 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2524 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2525 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2526 		atomic_read(&tw->tw_refcnt), tw);
2527 }
2528 
2529 #define TMPSZ 150
2530 
2531 static int tcp4_seq_show(struct seq_file *seq, void *v)
2532 {
2533 	struct tcp_iter_state* st;
2534 	char tmpbuf[TMPSZ + 1];
2535 
2536 	if (v == SEQ_START_TOKEN) {
2537 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2538 			   "  sl  local_address rem_address   st tx_queue "
2539 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2540 			   "inode");
2541 		goto out;
2542 	}
2543 	st = seq->private;
2544 
2545 	switch (st->state) {
2546 	case TCP_SEQ_STATE_LISTENING:
2547 	case TCP_SEQ_STATE_ESTABLISHED:
2548 		get_tcp4_sock(v, tmpbuf, st->num);
2549 		break;
2550 	case TCP_SEQ_STATE_OPENREQ:
2551 		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2552 		break;
2553 	case TCP_SEQ_STATE_TIME_WAIT:
2554 		get_timewait4_sock(v, tmpbuf, st->num);
2555 		break;
2556 	}
2557 	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2558 out:
2559 	return 0;
2560 }
2561 
2562 static struct file_operations tcp4_seq_fops;
2563 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2564 	.owner		= THIS_MODULE,
2565 	.name		= "tcp",
2566 	.family		= AF_INET,
2567 	.seq_show	= tcp4_seq_show,
2568 	.seq_fops	= &tcp4_seq_fops,
2569 };
2570 
2571 int __init tcp4_proc_init(void)
2572 {
2573 	return tcp_proc_register(&tcp4_seq_afinfo);
2574 }
2575 
2576 void tcp4_proc_exit(void)
2577 {
2578 	tcp_proc_unregister(&tcp4_seq_afinfo);
2579 }
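
/*
 * Illustrative sketch only (not part of this file): another address
 * family can plug into the same /proc machinery by filling in its own
 * afinfo and registering it, much as tcp4 does above.  The tcpX_* names
 * below are hypothetical placeholders.
 *
 *	static struct file_operations tcpX_seq_fops;
 *	static struct tcp_seq_afinfo tcpX_seq_afinfo = {
 *		.owner		= THIS_MODULE,
 *		.name		= "tcpX",
 *		.family		= AF_INET6,
 *		.seq_show	= tcpX_seq_show,
 *		.seq_fops	= &tcpX_seq_fops,
 *	};
 *
 *	static int __init tcpX_proc_init(void)
 *	{
 *		return tcp_proc_register(&tcpX_seq_afinfo);
 *	}
 */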
2580 #endif /* CONFIG_PROC_FS */
2581 
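/*
 * The struct proto through which the generic socket layer drives IPv4
 * TCP (SOCK_STREAM/IPPROTO_TCP) sockets.
 */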
2582 struct proto tcp_prot = {
2583 	.name			= "TCP",
2584 	.owner			= THIS_MODULE,
2585 	.close			= tcp_close,
2586 	.connect		= tcp_v4_connect,
2587 	.disconnect		= tcp_disconnect,
2588 	.accept			= tcp_accept,
2589 	.ioctl			= tcp_ioctl,
2590 	.init			= tcp_v4_init_sock,
2591 	.destroy		= tcp_v4_destroy_sock,
2592 	.shutdown		= tcp_shutdown,
2593 	.setsockopt		= tcp_setsockopt,
2594 	.getsockopt		= tcp_getsockopt,
2595 	.sendmsg		= tcp_sendmsg,
2596 	.recvmsg		= tcp_recvmsg,
2597 	.backlog_rcv		= tcp_v4_do_rcv,
2598 	.hash			= tcp_v4_hash,
2599 	.unhash			= tcp_unhash,
2600 	.get_port		= tcp_v4_get_port,
2601 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2602 	.sockets_allocated	= &tcp_sockets_allocated,
2603 	.memory_allocated	= &tcp_memory_allocated,
2604 	.memory_pressure	= &tcp_memory_pressure,
2605 	.sysctl_mem		= sysctl_tcp_mem,
2606 	.sysctl_wmem		= sysctl_tcp_wmem,
2607 	.sysctl_rmem		= sysctl_tcp_rmem,
2608 	.max_header		= MAX_TCP_HEADER,
2609 	.obj_size		= sizeof(struct tcp_sock),
2610 	.rsk_prot		= &tcp_request_sock_ops,
2611 };
2612 
2613 
2614 
2615 void __init tcp_v4_init(struct net_proto_family *ops)
2616 {
2617 	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2618 	if (err < 0)
2619 		panic("Failed to create the TCP control socket.\n");
2620 	tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2621 	inet_sk(tcp_socket->sk)->uc_ttl = -1;
2622 
2623 	/* Unhash it so that IP input processing does not even
2624 	 * see it; we do not want this socket to receive incoming
2625 	 * packets.
2626 	 */
2627 	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2628 }
2629 
2630 EXPORT_SYMBOL(ipv4_specific);
2631 EXPORT_SYMBOL(tcp_bind_hash);
2632 EXPORT_SYMBOL(tcp_bucket_create);
2633 EXPORT_SYMBOL(tcp_hashinfo);
2634 EXPORT_SYMBOL(tcp_inherit_port);
2635 EXPORT_SYMBOL(tcp_listen_wlock);
2636 EXPORT_SYMBOL(tcp_port_rover);
2637 EXPORT_SYMBOL(tcp_prot);
2638 EXPORT_SYMBOL(tcp_put_port);
2639 EXPORT_SYMBOL(tcp_unhash);
2640 EXPORT_SYMBOL(tcp_v4_conn_request);
2641 EXPORT_SYMBOL(tcp_v4_connect);
2642 EXPORT_SYMBOL(tcp_v4_do_rcv);
2643 EXPORT_SYMBOL(tcp_v4_rebuild_header);
2644 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2645 EXPORT_SYMBOL(tcp_v4_send_check);
2646 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2647 
2648 #ifdef CONFIG_PROC_FS
2649 EXPORT_SYMBOL(tcp_proc_register);
2650 EXPORT_SYMBOL(tcp_proc_unregister);
2651 #endif
2652 EXPORT_SYMBOL(sysctl_local_port_range);
2653 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2654 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2655 
2656