xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision b6dcefde)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63 
64 #include <net/net_namespace.h>
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/timewait_sock.h>
72 #include <net/xfrm.h>
73 #include <net/netdma.h>
74 
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 
81 #include <linux/crypto.h>
82 #include <linux/scatterlist.h>
83 
84 int sysctl_tcp_tw_reuse __read_mostly;
85 int sysctl_tcp_low_latency __read_mostly;
86 
87 
88 #ifdef CONFIG_TCP_MD5SIG
89 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
90 						   __be32 addr);
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
92 			       __be32 daddr, __be32 saddr, struct tcphdr *th);
93 #else
94 static inline
95 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
96 {
97 	return NULL;
98 }
99 #endif
100 
101 struct inet_hashinfo tcp_hashinfo;
102 
103 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
104 {
105 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
106 					  ip_hdr(skb)->saddr,
107 					  tcp_hdr(skb)->dest,
108 					  tcp_hdr(skb)->source);
109 }
110 
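/* Decide whether an existing TIME-WAIT socket (sktw) may be reused by a new
 * outgoing connection.  If reuse is allowed, seed write_seq and the
 * timestamp state from the old timewait socket, take a reference on it and
 * return 1; otherwise return 0.
 */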
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	/* With PAWS this is safe from the viewpoint of data integrity.
117 	   Even without PAWS it is safe, provided the sequence spaces do
118 	   not overlap, i.e. at data rates <= 80 Mbit/sec.
119 
120 	   The idea is close to VJ's: the timestamp cache is held not per
121 	   host but per port pair, and the TW bucket is used as the state
122 	   holder.
123 
124 	   If the TW bucket has already been destroyed we fall back to VJ's
125 	   scheme and use the initial timestamp retrieved from the peer table.
126 	 */
127 	if (tcptw->tw_ts_recent_stamp &&
128 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
129 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131 		if (tp->write_seq == 0)
132 			tp->write_seq = 1;
133 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
134 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135 		sock_hold(sktw);
136 		return 1;
137 	}
138 
139 	return 0;
140 }
141 
142 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
143 
144 /* This will initiate an outgoing connection. */
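/* A rough userspace sketch (illustrative only, not part of this file),
 * assuming the usual socket API: a connect() on an AF_INET stream socket is
 * what eventually lands here via the AF_INET stream protocol ops.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 */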
145 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
146 {
147 	struct inet_sock *inet = inet_sk(sk);
148 	struct tcp_sock *tp = tcp_sk(sk);
149 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150 	struct rtable *rt;
151 	__be32 daddr, nexthop;
152 	int tmp;
153 	int err;
154 
155 	if (addr_len < sizeof(struct sockaddr_in))
156 		return -EINVAL;
157 
158 	if (usin->sin_family != AF_INET)
159 		return -EAFNOSUPPORT;
160 
161 	nexthop = daddr = usin->sin_addr.s_addr;
162 	if (inet->opt && inet->opt->srr) {
163 		if (!daddr)
164 			return -EINVAL;
165 		nexthop = inet->opt->faddr;
166 	}
167 
168 	tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
169 			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
170 			       IPPROTO_TCP,
171 			       inet->inet_sport, usin->sin_port, sk, 1);
172 	if (tmp < 0) {
173 		if (tmp == -ENETUNREACH)
174 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
175 		return tmp;
176 	}
177 
178 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
179 		ip_rt_put(rt);
180 		return -ENETUNREACH;
181 	}
182 
183 	if (!inet->opt || !inet->opt->srr)
184 		daddr = rt->rt_dst;
185 
186 	if (!inet->inet_saddr)
187 		inet->inet_saddr = rt->rt_src;
188 	inet->inet_rcv_saddr = inet->inet_saddr;
189 
190 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
191 		/* Reset inherited state */
192 		tp->rx_opt.ts_recent	   = 0;
193 		tp->rx_opt.ts_recent_stamp = 0;
194 		tp->write_seq		   = 0;
195 	}
196 
197 	if (tcp_death_row.sysctl_tw_recycle &&
198 	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
199 		struct inet_peer *peer = rt_get_peer(rt);
200 		/*
201 		 * VJ's idea. We save the last timestamp seen from
202 		 * the destination in the peer table when entering
203 		 * TIME-WAIT state, and initialize rx_opt.ts_recent
204 		 * from it when trying a new connection.
205 		 */
206 		if (peer != NULL &&
207 		    (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
208 			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
209 			tp->rx_opt.ts_recent = peer->tcp_ts;
210 		}
211 	}
212 
213 	inet->inet_dport = usin->sin_port;
214 	inet->inet_daddr = daddr;
215 
216 	inet_csk(sk)->icsk_ext_hdr_len = 0;
217 	if (inet->opt)
218 		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
219 
220 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
221 
222 	/* Socket identity is still unknown (sport may be zero).
223 	 * However we set the state to SYN-SENT and, without releasing the
224 	 * socket lock, select a source port, enter ourselves into the hash
225 	 * tables and complete initialization after this.
226 	 */
227 	tcp_set_state(sk, TCP_SYN_SENT);
228 	err = inet_hash_connect(&tcp_death_row, sk);
229 	if (err)
230 		goto failure;
231 
232 	err = ip_route_newports(&rt, IPPROTO_TCP,
233 				inet->inet_sport, inet->inet_dport, sk);
234 	if (err)
235 		goto failure;
236 
237 	/* OK, now commit destination to socket.  */
238 	sk->sk_gso_type = SKB_GSO_TCPV4;
239 	sk_setup_caps(sk, &rt->u.dst);
240 
241 	if (!tp->write_seq)
242 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
243 							   inet->inet_daddr,
244 							   inet->inet_sport,
245 							   usin->sin_port);
246 
247 	inet->inet_id = tp->write_seq ^ jiffies;
248 
249 	err = tcp_connect(sk);
250 	rt = NULL;
251 	if (err)
252 		goto failure;
253 
254 	return 0;
255 
256 failure:
257 	/*
258 	 * This unhashes the socket and releases the local port,
259 	 * if necessary.
260 	 */
261 	tcp_set_state(sk, TCP_CLOSE);
262 	ip_rt_put(rt);
263 	sk->sk_route_caps = 0;
264 	inet->inet_dport = 0;
265 	return err;
266 }
267 
268 /*
269  * This routine does path mtu discovery as defined in RFC1191.
270  */
271 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
272 {
273 	struct dst_entry *dst;
274 	struct inet_sock *inet = inet_sk(sk);
275 
276 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
277 	 * sent out by Linux are always < 576 bytes, so they should go through
278 	 * unfragmented).
279 	 */
280 	if (sk->sk_state == TCP_LISTEN)
281 		return;
282 
283 	/* We don't check in the dst entry whether pmtu discovery is forbidden
284 	 * on this route. We just assume that no packet-too-big messages
285 	 * are sent back when pmtu discovery is not active.
286 	 * There is a small race when the user changes this flag in the
287 	 * route, but I think that's acceptable.
288 	 */
289 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
290 		return;
291 
292 	dst->ops->update_pmtu(dst, mtu);
293 
294 	/* Something is about to go wrong... Remember the soft error
295 	 * in case this connection is not able to recover.
296 	 */
297 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
298 		sk->sk_err_soft = EMSGSIZE;
299 
300 	mtu = dst_mtu(dst);
301 
302 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
303 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
304 		tcp_sync_mss(sk, mtu);
305 
306 		/* Resend the TCP packet because it's
307 		 * clear that the old packet has been
308 		 * dropped. This is the new "fast" path mtu
309 		 * discovery.
310 		 */
311 		tcp_simple_retransmit(sk);
312 	} /* else let the usual retransmit timer handle it */
313 }
314 
315 /*
316  * This routine is called by the ICMP module when it gets some
317  * sort of error condition.  If err < 0 then the socket should
318  * be closed and the error returned to the user.  If err > 0
319  * it's just the icmp type << 8 | icmp code.  After adjustment
320  * header points to the first 8 bytes of the tcp header.  We need
321  * to find the appropriate port.
322  *
323  * The locking strategy used here is very "optimistic". When
324  * someone else accesses the socket the ICMP is just dropped
325  * and for some paths there is no check at all.
326  * A more general error queue to queue errors for later handling
327  * is probably better.
328  *
329  */
330 
331 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
332 {
333 	struct iphdr *iph = (struct iphdr *)icmp_skb->data;
334 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
335 	struct inet_connection_sock *icsk;
336 	struct tcp_sock *tp;
337 	struct inet_sock *inet;
338 	const int type = icmp_hdr(icmp_skb)->type;
339 	const int code = icmp_hdr(icmp_skb)->code;
340 	struct sock *sk;
341 	struct sk_buff *skb;
342 	__u32 seq;
343 	__u32 remaining;
344 	int err;
345 	struct net *net = dev_net(icmp_skb->dev);
346 
347 	if (icmp_skb->len < (iph->ihl << 2) + 8) {
348 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
349 		return;
350 	}
351 
352 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
353 			iph->saddr, th->source, inet_iif(icmp_skb));
354 	if (!sk) {
355 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
356 		return;
357 	}
358 	if (sk->sk_state == TCP_TIME_WAIT) {
359 		inet_twsk_put(inet_twsk(sk));
360 		return;
361 	}
362 
363 	bh_lock_sock(sk);
364 	/* If too many ICMPs get dropped on busy
365 	 * servers this needs to be solved differently.
366 	 */
367 	if (sock_owned_by_user(sk))
368 		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369 
370 	if (sk->sk_state == TCP_CLOSE)
371 		goto out;
372 
373 	icsk = inet_csk(sk);
374 	tp = tcp_sk(sk);
375 	seq = ntohl(th->seq);
376 	if (sk->sk_state != TCP_LISTEN &&
377 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
378 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
379 		goto out;
380 	}
381 
382 	switch (type) {
383 	case ICMP_SOURCE_QUENCH:
384 		/* Just silently ignore these. */
385 		goto out;
386 	case ICMP_PARAMETERPROB:
387 		err = EPROTO;
388 		break;
389 	case ICMP_DEST_UNREACH:
390 		if (code > NR_ICMP_UNREACH)
391 			goto out;
392 
393 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
394 			if (!sock_owned_by_user(sk))
395 				do_pmtu_discovery(sk, iph, info);
396 			goto out;
397 		}
398 
399 		err = icmp_err_convert[code].errno;
400 		/* check if icmp_skb allows revert of backoff
401 		 * (see draft-zimmermann-tcp-lcd) */
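		/* Undo one backoff step: the RTO is recomputed (__tcp_set_rto)
		 * and shifted by the decremented backoff count, then the
		 * retransmit timer is re-armed with whatever is left of that
		 * shorter RTO, or we retransmit right away (deferred when the
		 * socket is locked by the user).
		 */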
402 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
403 			break;
404 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
405 		    !icsk->icsk_backoff)
406 			break;
407 
408 		icsk->icsk_backoff--;
409 		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
410 					 icsk->icsk_backoff;
411 		tcp_bound_rto(sk);
412 
413 		skb = tcp_write_queue_head(sk);
414 		BUG_ON(!skb);
415 
416 		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
417 				tcp_time_stamp - TCP_SKB_CB(skb)->when);
418 
419 		if (remaining) {
420 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
421 						  remaining, TCP_RTO_MAX);
422 		} else if (sock_owned_by_user(sk)) {
423 			/* RTO revert clocked out retransmission,
424 			 * but socket is locked. Will defer. */
425 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
426 						  HZ/20, TCP_RTO_MAX);
427 		} else {
428 			/* RTO revert clocked out retransmission.
429 			 * Will retransmit now */
430 			tcp_retransmit_timer(sk);
431 		}
432 
433 		break;
434 	case ICMP_TIME_EXCEEDED:
435 		err = EHOSTUNREACH;
436 		break;
437 	default:
438 		goto out;
439 	}
440 
441 	switch (sk->sk_state) {
442 		struct request_sock *req, **prev;
443 	case TCP_LISTEN:
444 		if (sock_owned_by_user(sk))
445 			goto out;
446 
447 		req = inet_csk_search_req(sk, &prev, th->dest,
448 					  iph->daddr, iph->saddr);
449 		if (!req)
450 			goto out;
451 
452 		/* ICMPs are not backlogged, hence we cannot get
453 		   an established socket here.
454 		 */
455 		WARN_ON(req->sk);
456 
457 		if (seq != tcp_rsk(req)->snt_isn) {
458 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
459 			goto out;
460 		}
461 
462 		/*
463 		 * Still in SYN_RECV, just remove it silently.
464 		 * There is no good way to pass the error to the newly
465 		 * created socket, and POSIX does not want network
466 		 * errors returned from accept().
467 		 */
468 		inet_csk_reqsk_queue_drop(sk, req, prev);
469 		goto out;
470 
471 	case TCP_SYN_SENT:
472 	case TCP_SYN_RECV:  /* Cannot happen.
473 			       It can, e.g. if SYNs crossed.
474 			     */
475 		if (!sock_owned_by_user(sk)) {
476 			sk->sk_err = err;
477 
478 			sk->sk_error_report(sk);
479 
480 			tcp_done(sk);
481 		} else {
482 			sk->sk_err_soft = err;
483 		}
484 		goto out;
485 	}
486 
487 	/* If we've already connected we will keep trying
488 	 * until we time out, or the user gives up.
489 	 *
490 	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
491 	 * considered hard errors (well, FRAG_FAILED too,
492 	 * but it is obsoleted by pmtu discovery).
493 	 *
494 	 * Note that in the modern internet, where routing is unreliable
495 	 * and broken firewalls sit in every dark corner sending random
496 	 * errors ordered by their masters, even these two messages finally
497 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
498 	 *
499 	 * Now we are in compliance with the RFCs.
500 	 *							--ANK (980905)
501 	 */
502 
503 	inet = inet_sk(sk);
504 	if (!sock_owned_by_user(sk) && inet->recverr) {
505 		sk->sk_err = err;
506 		sk->sk_error_report(sk);
507 	} else	{ /* Only an error on timeout */
508 		sk->sk_err_soft = err;
509 	}
510 
511 out:
512 	bh_unlock_sock(sk);
513 	sock_put(sk);
514 }
515 
516 /* This routine computes an IPv4 TCP checksum. */
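/* With CHECKSUM_PARTIAL only the pseudo-header sum is filled in and
 * csum_start/csum_offset tell the device (or GSO path) where to complete the
 * checksum; otherwise the full checksum is computed here in software.
 */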
517 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
518 {
519 	struct inet_sock *inet = inet_sk(sk);
520 	struct tcphdr *th = tcp_hdr(skb);
521 
522 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
523 		th->check = ~tcp_v4_check(len, inet->inet_saddr,
524 					  inet->inet_daddr, 0);
525 		skb->csum_start = skb_transport_header(skb) - skb->head;
526 		skb->csum_offset = offsetof(struct tcphdr, check);
527 	} else {
528 		th->check = tcp_v4_check(len, inet->inet_saddr,
529 					 inet->inet_daddr,
530 					 csum_partial(th,
531 						      th->doff << 2,
532 						      skb->csum));
533 	}
534 }
535 
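/* Prepare the checksum fields of a GSO skb: store the pseudo-header sum and
 * mark the skb CHECKSUM_PARTIAL so each segment's checksum is completed
 * later at csum_start/csum_offset.
 */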
536 int tcp_v4_gso_send_check(struct sk_buff *skb)
537 {
538 	const struct iphdr *iph;
539 	struct tcphdr *th;
540 
541 	if (!pskb_may_pull(skb, sizeof(*th)))
542 		return -EINVAL;
543 
544 	iph = ip_hdr(skb);
545 	th = tcp_hdr(skb);
546 
547 	th->check = 0;
548 	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
549 	skb->csum_start = skb_transport_header(skb) - skb->head;
550 	skb->csum_offset = offsetof(struct tcphdr, check);
551 	skb->ip_summed = CHECKSUM_PARTIAL;
552 	return 0;
553 }
554 
555 /*
556  *	This routine will send an RST to the other tcp.
557  *
558  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
559  *		      for the reset?
560  *	Answer: if a packet caused the RST, it is not for a socket
561  *		existing in our system; if it did match a socket, it would
562  *		just be a duplicate segment or a bug in the other side's TCP.
563  *		So we build the reply based only on the parameters that
564  *		arrived with the segment.
565  *	Exception: precedence violation. We do not implement it in any case.
566  */
567 
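/* As in classic RFC 793 reset generation: if the offending segment carried
 * an ACK, the RST reuses that ACK number as its sequence number; otherwise
 * the RST has SEQ=0 and ACKs everything the segment occupied (SYN and FIN
 * each count as one).
 */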
568 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
569 {
570 	struct tcphdr *th = tcp_hdr(skb);
571 	struct {
572 		struct tcphdr th;
573 #ifdef CONFIG_TCP_MD5SIG
574 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
575 #endif
576 	} rep;
577 	struct ip_reply_arg arg;
578 #ifdef CONFIG_TCP_MD5SIG
579 	struct tcp_md5sig_key *key;
580 #endif
581 	struct net *net;
582 
583 	/* Never send a reset in response to a reset. */
584 	if (th->rst)
585 		return;
586 
587 	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
588 		return;
589 
590 	/* Swap the send and the receive. */
591 	memset(&rep, 0, sizeof(rep));
592 	rep.th.dest   = th->source;
593 	rep.th.source = th->dest;
594 	rep.th.doff   = sizeof(struct tcphdr) / 4;
595 	rep.th.rst    = 1;
596 
597 	if (th->ack) {
598 		rep.th.seq = th->ack_seq;
599 	} else {
600 		rep.th.ack = 1;
601 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
602 				       skb->len - (th->doff << 2));
603 	}
604 
605 	memset(&arg, 0, sizeof(arg));
606 	arg.iov[0].iov_base = (unsigned char *)&rep;
607 	arg.iov[0].iov_len  = sizeof(rep.th);
608 
609 #ifdef CONFIG_TCP_MD5SIG
610 	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
611 	if (key) {
612 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
613 				   (TCPOPT_NOP << 16) |
614 				   (TCPOPT_MD5SIG << 8) |
615 				   TCPOLEN_MD5SIG);
616 		/* Update length and the length the header thinks exists */
617 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
618 		rep.th.doff = arg.iov[0].iov_len / 4;
619 
620 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
621 				     key, ip_hdr(skb)->saddr,
622 				     ip_hdr(skb)->daddr, &rep.th);
623 	}
624 #endif
625 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
626 				      ip_hdr(skb)->saddr, /* XXX */
627 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
628 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
629 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
630 
631 	net = dev_net(skb_dst(skb)->dev);
632 	ip_send_reply(net->ipv4.tcp_sock, skb,
633 		      &arg, arg.iov[0].iov_len);
634 
635 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
636 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
637 }
638 
639 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
640    outside of socket context, is certainly ugly. What can I do?
641  */
642 
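/* Build and send a bare ACK, used for the TIME-WAIT and SYN-RECV replies
 * below.  rep.opt[] is filled, in order, with an optional timestamp option
 * and an optional MD5 signature option; doff and the iov length grow to
 * match.
 */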
643 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
644 			    u32 win, u32 ts, int oif,
645 			    struct tcp_md5sig_key *key,
646 			    int reply_flags)
647 {
648 	struct tcphdr *th = tcp_hdr(skb);
649 	struct {
650 		struct tcphdr th;
651 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
652 #ifdef CONFIG_TCP_MD5SIG
653 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
654 #endif
655 			];
656 	} rep;
657 	struct ip_reply_arg arg;
658 	struct net *net = dev_net(skb_dst(skb)->dev);
659 
660 	memset(&rep.th, 0, sizeof(struct tcphdr));
661 	memset(&arg, 0, sizeof(arg));
662 
663 	arg.iov[0].iov_base = (unsigned char *)&rep;
664 	arg.iov[0].iov_len  = sizeof(rep.th);
665 	if (ts) {
666 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
667 				   (TCPOPT_TIMESTAMP << 8) |
668 				   TCPOLEN_TIMESTAMP);
669 		rep.opt[1] = htonl(tcp_time_stamp);
670 		rep.opt[2] = htonl(ts);
671 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
672 	}
673 
674 	/* Swap the send and the receive. */
675 	rep.th.dest    = th->source;
676 	rep.th.source  = th->dest;
677 	rep.th.doff    = arg.iov[0].iov_len / 4;
678 	rep.th.seq     = htonl(seq);
679 	rep.th.ack_seq = htonl(ack);
680 	rep.th.ack     = 1;
681 	rep.th.window  = htons(win);
682 
683 #ifdef CONFIG_TCP_MD5SIG
684 	if (key) {
685 		int offset = (ts) ? 3 : 0;
686 
687 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
688 					  (TCPOPT_NOP << 16) |
689 					  (TCPOPT_MD5SIG << 8) |
690 					  TCPOLEN_MD5SIG);
691 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
692 		rep.th.doff = arg.iov[0].iov_len/4;
693 
694 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
695 				    key, ip_hdr(skb)->saddr,
696 				    ip_hdr(skb)->daddr, &rep.th);
697 	}
698 #endif
699 	arg.flags = reply_flags;
700 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
701 				      ip_hdr(skb)->saddr, /* XXX */
702 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
703 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
704 	if (oif)
705 		arg.bound_dev_if = oif;
706 
707 	ip_send_reply(net->ipv4.tcp_sock, skb,
708 		      &arg, arg.iov[0].iov_len);
709 
710 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
711 }
712 
713 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
714 {
715 	struct inet_timewait_sock *tw = inet_twsk(sk);
716 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
717 
718 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
719 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
720 			tcptw->tw_ts_recent,
721 			tw->tw_bound_dev_if,
722 			tcp_twsk_md5_key(tcptw),
723 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
724 			);
725 
726 	inet_twsk_put(tw);
727 }
728 
729 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
730 				  struct request_sock *req)
731 {
732 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
733 			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
734 			req->ts_recent,
735 			0,
736 			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
737 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
738 }
739 
740 /*
741  *	Send a SYN-ACK after having received a SYN.
742  *	This still operates on a request_sock only, not on a big
743  *	socket.
744  */
745 static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
746 				struct request_sock *req,
747 				struct request_values *rvp)
748 {
749 	const struct inet_request_sock *ireq = inet_rsk(req);
750 	int err = -1;
751 	struct sk_buff * skb;
752 
753 	/* First, grab a route. */
754 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
755 		return -1;
756 
757 	skb = tcp_make_synack(sk, dst, req, rvp);
758 
759 	if (skb) {
760 		struct tcphdr *th = tcp_hdr(skb);
761 
762 		th->check = tcp_v4_check(skb->len,
763 					 ireq->loc_addr,
764 					 ireq->rmt_addr,
765 					 csum_partial(th, skb->len,
766 						      skb->csum));
767 
768 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
769 					    ireq->rmt_addr,
770 					    ireq->opt);
771 		err = net_xmit_eval(err);
772 	}
773 
774 	dst_release(dst);
775 	return err;
776 }
777 
778 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
779 			      struct request_values *rvp)
780 {
781 	return __tcp_v4_send_synack(sk, NULL, req, rvp);
782 }
783 
784 /*
785  *	IPv4 request_sock destructor.
786  */
787 static void tcp_v4_reqsk_destructor(struct request_sock *req)
788 {
789 	kfree(inet_rsk(req)->opt);
790 }
791 
792 #ifdef CONFIG_SYN_COOKIES
793 static void syn_flood_warning(struct sk_buff *skb)
794 {
795 	static unsigned long warntime;
796 
797 	if (time_after(jiffies, (warntime + HZ * 60))) {
798 		warntime = jiffies;
799 		printk(KERN_INFO
800 		       "possible SYN flooding on port %d. Sending cookies.\n",
801 		       ntohs(tcp_hdr(skb)->dest));
802 	}
803 }
804 #endif
805 
806 /*
807  * Save and compile IPv4 options into the request_sock if needed.
808  */
809 static struct ip_options *tcp_v4_save_options(struct sock *sk,
810 					      struct sk_buff *skb)
811 {
812 	struct ip_options *opt = &(IPCB(skb)->opt);
813 	struct ip_options *dopt = NULL;
814 
815 	if (opt && opt->optlen) {
816 		int opt_size = optlength(opt);
817 		dopt = kmalloc(opt_size, GFP_ATOMIC);
818 		if (dopt) {
819 			if (ip_options_echo(dopt, skb)) {
820 				kfree(dopt);
821 				dopt = NULL;
822 			}
823 		}
824 	}
825 	return dopt;
826 }
827 
828 #ifdef CONFIG_TCP_MD5SIG
829 /*
830  * RFC2385 MD5 checksumming requires a mapping of
831  * IP address->MD5 Key.
832  * We need to maintain these in the sk structure.
833  */
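/*
 * Illustrative userspace sketch (not part of this file) of installing such a
 * key via the TCP_MD5SIG socket option, which is parsed by
 * tcp_v4_parse_md5_keys() below; "peer" is assumed to hold the peer's
 * sockaddr_in.
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *
 *	memcpy(&md5.tcpm_addr, &peer, sizeof(peer));
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */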
834 
835 /* Find the Key structure for an address.  */
836 static struct tcp_md5sig_key *
837 			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
838 {
839 	struct tcp_sock *tp = tcp_sk(sk);
840 	int i;
841 
842 	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
843 		return NULL;
844 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
845 		if (tp->md5sig_info->keys4[i].addr == addr)
846 			return &tp->md5sig_info->keys4[i].base;
847 	}
848 	return NULL;
849 }
850 
851 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
852 					 struct sock *addr_sk)
853 {
854 	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
855 }
856 
857 EXPORT_SYMBOL(tcp_v4_md5_lookup);
858 
859 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
860 						      struct request_sock *req)
861 {
862 	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
863 }
864 
865 /* This can be called on a newly created socket, from other files */
866 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
867 		      u8 *newkey, u8 newkeylen)
868 {
869 	/* Add Key to the list */
870 	struct tcp_md5sig_key *key;
871 	struct tcp_sock *tp = tcp_sk(sk);
872 	struct tcp4_md5sig_key *keys;
873 
874 	key = tcp_v4_md5_do_lookup(sk, addr);
875 	if (key) {
876 		/* Pre-existing entry - just update that one. */
877 		kfree(key->key);
878 		key->key = newkey;
879 		key->keylen = newkeylen;
880 	} else {
881 		struct tcp_md5sig_info *md5sig;
882 
883 		if (!tp->md5sig_info) {
884 			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
885 						  GFP_ATOMIC);
886 			if (!tp->md5sig_info) {
887 				kfree(newkey);
888 				return -ENOMEM;
889 			}
890 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
891 		}
892 		if (tcp_alloc_md5sig_pool(sk) == NULL) {
893 			kfree(newkey);
894 			return -ENOMEM;
895 		}
896 		md5sig = tp->md5sig_info;
897 
898 		if (md5sig->alloced4 == md5sig->entries4) {
899 			keys = kmalloc((sizeof(*keys) *
900 					(md5sig->entries4 + 1)), GFP_ATOMIC);
901 			if (!keys) {
902 				kfree(newkey);
903 				tcp_free_md5sig_pool();
904 				return -ENOMEM;
905 			}
906 
907 			if (md5sig->entries4)
908 				memcpy(keys, md5sig->keys4,
909 				       sizeof(*keys) * md5sig->entries4);
910 
911 			/* Free old key list, and reference new one */
912 			kfree(md5sig->keys4);
913 			md5sig->keys4 = keys;
914 			md5sig->alloced4++;
915 		}
916 		md5sig->entries4++;
917 		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
918 		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
919 		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
920 	}
921 	return 0;
922 }
923 
924 EXPORT_SYMBOL(tcp_v4_md5_do_add);
925 
926 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
927 			       u8 *newkey, u8 newkeylen)
928 {
929 	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
930 				 newkey, newkeylen);
931 }
932 
933 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
934 {
935 	struct tcp_sock *tp = tcp_sk(sk);
936 	int i;
937 
938 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
939 		if (tp->md5sig_info->keys4[i].addr == addr) {
940 			/* Free the key */
941 			kfree(tp->md5sig_info->keys4[i].base.key);
942 			tp->md5sig_info->entries4--;
943 
944 			if (tp->md5sig_info->entries4 == 0) {
945 				kfree(tp->md5sig_info->keys4);
946 				tp->md5sig_info->keys4 = NULL;
947 				tp->md5sig_info->alloced4 = 0;
948 			} else if (tp->md5sig_info->entries4 != i) {
949 				/* Shift the remaining keys down over the removed one */
950 				memmove(&tp->md5sig_info->keys4[i],
951 					&tp->md5sig_info->keys4[i+1],
952 					(tp->md5sig_info->entries4 - i) *
953 					 sizeof(struct tcp4_md5sig_key));
954 			}
955 			tcp_free_md5sig_pool();
956 			return 0;
957 		}
958 	}
959 	return -ENOENT;
960 }
961 
962 EXPORT_SYMBOL(tcp_v4_md5_do_del);
963 
964 static void tcp_v4_clear_md5_list(struct sock *sk)
965 {
966 	struct tcp_sock *tp = tcp_sk(sk);
967 
968 	/* Free each key, then the set of keys,
969 	 * the crypto element, and then decrement our
970 	 * hold on the last resort crypto.
971 	 */
972 	if (tp->md5sig_info->entries4) {
973 		int i;
974 		for (i = 0; i < tp->md5sig_info->entries4; i++)
975 			kfree(tp->md5sig_info->keys4[i].base.key);
976 		tp->md5sig_info->entries4 = 0;
977 		tcp_free_md5sig_pool();
978 	}
979 	if (tp->md5sig_info->keys4) {
980 		kfree(tp->md5sig_info->keys4);
981 		tp->md5sig_info->keys4 = NULL;
982 		tp->md5sig_info->alloced4  = 0;
983 	}
984 }
985 
986 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
987 				 int optlen)
988 {
989 	struct tcp_md5sig cmd;
990 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
991 	u8 *newkey;
992 
993 	if (optlen < sizeof(cmd))
994 		return -EINVAL;
995 
996 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
997 		return -EFAULT;
998 
999 	if (sin->sin_family != AF_INET)
1000 		return -EINVAL;
1001 
1002 	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1003 		if (!tcp_sk(sk)->md5sig_info)
1004 			return -ENOENT;
1005 		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1006 	}
1007 
1008 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1009 		return -EINVAL;
1010 
1011 	if (!tcp_sk(sk)->md5sig_info) {
1012 		struct tcp_sock *tp = tcp_sk(sk);
1013 		struct tcp_md5sig_info *p;
1014 
1015 		p = kzalloc(sizeof(*p), sk->sk_allocation);
1016 		if (!p)
1017 			return -EINVAL;
1018 
1019 		tp->md5sig_info = p;
1020 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1021 	}
1022 
1023 	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1024 	if (!newkey)
1025 		return -ENOMEM;
1026 	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1027 				 newkey, cmd.tcpm_keylen);
1028 }
1029 
1030 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1031 					__be32 daddr, __be32 saddr, int nbytes)
1032 {
1033 	struct tcp4_pseudohdr *bp;
1034 	struct scatterlist sg;
1035 
1036 	bp = &hp->md5_blk.ip4;
1037 
1038 	/*
1039 	 * 1. the TCP pseudo-header (in the order: source IP address,
1040 	 * destination IP address, zero-padded protocol number, and
1041 	 * segment length)
1042 	 */
1043 	bp->saddr = saddr;
1044 	bp->daddr = daddr;
1045 	bp->pad = 0;
1046 	bp->protocol = IPPROTO_TCP;
1047 	bp->len = cpu_to_be16(nbytes);
1048 
1049 	sg_init_one(&sg, bp, sizeof(*bp));
1050 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1051 }
1052 
1053 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1054 			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1055 {
1056 	struct tcp_md5sig_pool *hp;
1057 	struct hash_desc *desc;
1058 
1059 	hp = tcp_get_md5sig_pool();
1060 	if (!hp)
1061 		goto clear_hash_noput;
1062 	desc = &hp->md5_desc;
1063 
1064 	if (crypto_hash_init(desc))
1065 		goto clear_hash;
1066 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1067 		goto clear_hash;
1068 	if (tcp_md5_hash_header(hp, th))
1069 		goto clear_hash;
1070 	if (tcp_md5_hash_key(hp, key))
1071 		goto clear_hash;
1072 	if (crypto_hash_final(desc, md5_hash))
1073 		goto clear_hash;
1074 
1075 	tcp_put_md5sig_pool();
1076 	return 0;
1077 
1078 clear_hash:
1079 	tcp_put_md5sig_pool();
1080 clear_hash_noput:
1081 	memset(md5_hash, 0, 16);
1082 	return 1;
1083 }
1084 
1085 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1086 			struct sock *sk, struct request_sock *req,
1087 			struct sk_buff *skb)
1088 {
1089 	struct tcp_md5sig_pool *hp;
1090 	struct hash_desc *desc;
1091 	struct tcphdr *th = tcp_hdr(skb);
1092 	__be32 saddr, daddr;
1093 
1094 	if (sk) {
1095 		saddr = inet_sk(sk)->inet_saddr;
1096 		daddr = inet_sk(sk)->inet_daddr;
1097 	} else if (req) {
1098 		saddr = inet_rsk(req)->loc_addr;
1099 		daddr = inet_rsk(req)->rmt_addr;
1100 	} else {
1101 		const struct iphdr *iph = ip_hdr(skb);
1102 		saddr = iph->saddr;
1103 		daddr = iph->daddr;
1104 	}
1105 
1106 	hp = tcp_get_md5sig_pool();
1107 	if (!hp)
1108 		goto clear_hash_noput;
1109 	desc = &hp->md5_desc;
1110 
1111 	if (crypto_hash_init(desc))
1112 		goto clear_hash;
1113 
1114 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1115 		goto clear_hash;
1116 	if (tcp_md5_hash_header(hp, th))
1117 		goto clear_hash;
1118 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1119 		goto clear_hash;
1120 	if (tcp_md5_hash_key(hp, key))
1121 		goto clear_hash;
1122 	if (crypto_hash_final(desc, md5_hash))
1123 		goto clear_hash;
1124 
1125 	tcp_put_md5sig_pool();
1126 	return 0;
1127 
1128 clear_hash:
1129 	tcp_put_md5sig_pool();
1130 clear_hash_noput:
1131 	memset(md5_hash, 0, 16);
1132 	return 1;
1133 }
1134 
1135 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1136 
1137 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1138 {
1139 	/*
1140 	 * This gets called for each TCP segment that arrives,
1141 	 * so we want to be efficient.
1142 	 * We have 3 drop cases:
1143 	 * o No MD5 hash and one expected.
1144 	 * o MD5 hash and we're not expecting one.
1145 	 * o MD5 hash and it's wrong.
1146 	 */
1147 	__u8 *hash_location = NULL;
1148 	struct tcp_md5sig_key *hash_expected;
1149 	const struct iphdr *iph = ip_hdr(skb);
1150 	struct tcphdr *th = tcp_hdr(skb);
1151 	int genhash;
1152 	unsigned char newhash[16];
1153 
1154 	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1155 	hash_location = tcp_parse_md5sig_option(th);
1156 
1157 	/* We've parsed the options - do we have a hash? */
1158 	if (!hash_expected && !hash_location)
1159 		return 0;
1160 
1161 	if (hash_expected && !hash_location) {
1162 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1163 		return 1;
1164 	}
1165 
1166 	if (!hash_expected && hash_location) {
1167 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1168 		return 1;
1169 	}
1170 
1171 	/* Okay, so we have both hash_expected and hash_location -
1172 	 * we need to calculate the hash and compare it.
1173 	 */
1174 	genhash = tcp_v4_md5_hash_skb(newhash,
1175 				      hash_expected,
1176 				      NULL, NULL, skb);
1177 
1178 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1179 		if (net_ratelimit()) {
1180 			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1181 			       &iph->saddr, ntohs(th->source),
1182 			       &iph->daddr, ntohs(th->dest),
1183 			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1184 		}
1185 		return 1;
1186 	}
1187 	return 0;
1188 }
1189 
1190 #endif
1191 
1192 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1193 	.family		=	PF_INET,
1194 	.obj_size	=	sizeof(struct tcp_request_sock),
1195 	.rtx_syn_ack	=	tcp_v4_send_synack,
1196 	.send_ack	=	tcp_v4_reqsk_send_ack,
1197 	.destructor	=	tcp_v4_reqsk_destructor,
1198 	.send_reset	=	tcp_v4_send_reset,
1199 };
1200 
1201 #ifdef CONFIG_TCP_MD5SIG
1202 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1203 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1204 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1205 };
1206 #endif
1207 
1208 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1209 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1210 	.twsk_unique	= tcp_twsk_unique,
1211 	.twsk_destructor= tcp_twsk_destructor,
1212 };
1213 
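/* Handle an incoming SYN on a listening socket: parse the options, allocate
 * a request_sock, choose an initial sequence number (or a syncookie) and
 * send the SYN-ACK.  Unless syncookies are in use, the request is then added
 * to the SYN queue to await the final ACK.
 */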
1214 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1215 {
1216 	struct tcp_extend_values tmp_ext;
1217 	struct tcp_options_received tmp_opt;
1218 	u8 *hash_location;
1219 	struct request_sock *req;
1220 	struct inet_request_sock *ireq;
1221 	struct tcp_sock *tp = tcp_sk(sk);
1222 	struct dst_entry *dst = NULL;
1223 	__be32 saddr = ip_hdr(skb)->saddr;
1224 	__be32 daddr = ip_hdr(skb)->daddr;
1225 	__u32 isn = TCP_SKB_CB(skb)->when;
1226 #ifdef CONFIG_SYN_COOKIES
1227 	int want_cookie = 0;
1228 #else
1229 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1230 #endif
1231 
1232 	/* Never answer SYNs sent to broadcast or multicast addresses */
1233 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1234 		goto drop;
1235 
1236 	/* TW buckets are converted to open requests without
1237 	 * limitation; they conserve resources and the peer is
1238 	 * evidently a real one.
1239 	 */
1240 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1241 #ifdef CONFIG_SYN_COOKIES
1242 		if (sysctl_tcp_syncookies) {
1243 			want_cookie = 1;
1244 		} else
1245 #endif
1246 		goto drop;
1247 	}
1248 
1249 	/* Accept backlog is full. If we have already queued enough
1250 	 * warm entries in the syn queue, drop this request. That is better
1251 	 * than clogging the syn queue with openreqs with exponentially
1252 	 * increasing timeouts.
1253 	 */
1254 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1255 		goto drop;
1256 
1257 	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1258 	if (!req)
1259 		goto drop;
1260 
1261 #ifdef CONFIG_TCP_MD5SIG
1262 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1263 #endif
1264 
1265 	tcp_clear_options(&tmp_opt);
1266 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1267 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1268 	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1269 
1270 	if (tmp_opt.cookie_plus > 0 &&
1271 	    tmp_opt.saw_tstamp &&
1272 	    !tp->rx_opt.cookie_out_never &&
1273 	    (sysctl_tcp_cookie_size > 0 ||
1274 	     (tp->cookie_values != NULL &&
1275 	      tp->cookie_values->cookie_desired > 0))) {
1276 		u8 *c;
1277 		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1278 		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1279 
1280 		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1281 			goto drop_and_release;
1282 
1283 		/* Secret recipe starts with IP addresses */
1284 		*mess++ ^= daddr;
1285 		*mess++ ^= saddr;
1286 
1287 		/* plus variable length Initiator Cookie */
1288 		c = (u8 *)mess;
1289 		while (l-- > 0)
1290 			*c++ ^= *hash_location++;
1291 
1292 #ifdef CONFIG_SYN_COOKIES
1293 		want_cookie = 0;	/* not our kind of cookie */
1294 #endif
1295 		tmp_ext.cookie_out_never = 0; /* false */
1296 		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1297 	} else if (!tp->rx_opt.cookie_in_always) {
1298 		/* redundant indications, but ensure initialization. */
1299 		tmp_ext.cookie_out_never = 1; /* true */
1300 		tmp_ext.cookie_plus = 0;
1301 	} else {
1302 		goto drop_and_release;
1303 	}
1304 	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1305 
1306 	if (want_cookie && !tmp_opt.saw_tstamp)
1307 		tcp_clear_options(&tmp_opt);
1308 
1309 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1310 	tcp_openreq_init(req, &tmp_opt, skb);
1311 
1312 	ireq = inet_rsk(req);
1313 	ireq->loc_addr = daddr;
1314 	ireq->rmt_addr = saddr;
1315 	ireq->no_srccheck = inet_sk(sk)->transparent;
1316 	ireq->opt = tcp_v4_save_options(sk, skb);
1317 
1318 	if (security_inet_conn_request(sk, skb, req))
1319 		goto drop_and_free;
1320 
1321 	if (!want_cookie)
1322 		TCP_ECN_create_request(req, tcp_hdr(skb));
1323 
1324 	if (want_cookie) {
1325 #ifdef CONFIG_SYN_COOKIES
1326 		syn_flood_warning(skb);
1327 		req->cookie_ts = tmp_opt.tstamp_ok;
1328 #endif
1329 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1330 	} else if (!isn) {
1331 		struct inet_peer *peer = NULL;
1332 
1333 		/* VJ's idea. We save the last timestamp seen
1334 		 * from the destination in the peer table when entering
1335 		 * TIME-WAIT state, and check against it before
1336 		 * accepting a new connection request.
1337 		 *
1338 		 * If "isn" is not zero, this request hit a live
1339 		 * timewait bucket, so all the necessary checks
1340 		 * are made in the function processing timewait state.
1341 		 */
1342 		if (tmp_opt.saw_tstamp &&
1343 		    tcp_death_row.sysctl_tw_recycle &&
1344 		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1345 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1346 		    peer->v4daddr == saddr) {
1347 			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1348 			    (s32)(peer->tcp_ts - req->ts_recent) >
1349 							TCP_PAWS_WINDOW) {
1350 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1351 				goto drop_and_release;
1352 			}
1353 		}
1354 		/* Kill the following clause if you dislike this approach. */
1355 		else if (!sysctl_tcp_syncookies &&
1356 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1357 			  (sysctl_max_syn_backlog >> 2)) &&
1358 			 (!peer || !peer->tcp_ts_stamp) &&
1359 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1360 			/* Without syncookies the last quarter of the
1361 			 * backlog is filled with destinations
1362 			 * proven to be alive.
1363 			 * It means that we continue to communicate
1364 			 * with destinations already remembered
1365 			 * at the moment of the synflood.
1366 			 */
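			/* For example, with a max_syn_backlog of 256 this
			 * clause starts dropping unproven peers once fewer
			 * than 64 (256 >> 2) slots remain free.
			 */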
1367 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1368 				       &saddr, ntohs(tcp_hdr(skb)->source));
1369 			goto drop_and_release;
1370 		}
1371 
1372 		isn = tcp_v4_init_sequence(skb);
1373 	}
1374 	tcp_rsk(req)->snt_isn = isn;
1375 
1376 	if (__tcp_v4_send_synack(sk, dst, req,
1377 				 (struct request_values *)&tmp_ext) ||
1378 	    want_cookie)
1379 		goto drop_and_free;
1380 
1381 	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1382 	return 0;
1383 
1384 drop_and_release:
1385 	dst_release(dst);
1386 drop_and_free:
1387 	reqsk_free(req);
1388 drop:
1389 	return 0;
1390 }
1391 
1392 
1393 /*
1394  * The three-way handshake has completed - we received a valid ACK to our
1395  * SYN-ACK - so now create the new socket.
1396  */
1397 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1398 				  struct request_sock *req,
1399 				  struct dst_entry *dst)
1400 {
1401 	struct inet_request_sock *ireq;
1402 	struct inet_sock *newinet;
1403 	struct tcp_sock *newtp;
1404 	struct sock *newsk;
1405 #ifdef CONFIG_TCP_MD5SIG
1406 	struct tcp_md5sig_key *key;
1407 #endif
1408 
1409 	if (sk_acceptq_is_full(sk))
1410 		goto exit_overflow;
1411 
1412 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1413 		goto exit;
1414 
1415 	newsk = tcp_create_openreq_child(sk, req, skb);
1416 	if (!newsk)
1417 		goto exit;
1418 
1419 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1420 	sk_setup_caps(newsk, dst);
1421 
1422 	newtp		      = tcp_sk(newsk);
1423 	newinet		      = inet_sk(newsk);
1424 	ireq		      = inet_rsk(req);
1425 	newinet->inet_daddr   = ireq->rmt_addr;
1426 	newinet->inet_rcv_saddr = ireq->loc_addr;
1427 	newinet->inet_saddr	      = ireq->loc_addr;
1428 	newinet->opt	      = ireq->opt;
1429 	ireq->opt	      = NULL;
1430 	newinet->mc_index     = inet_iif(skb);
1431 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1432 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1433 	if (newinet->opt)
1434 		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1435 	newinet->inet_id = newtp->write_seq ^ jiffies;
1436 
1437 	tcp_mtup_init(newsk);
1438 	tcp_sync_mss(newsk, dst_mtu(dst));
1439 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1440 	if (tcp_sk(sk)->rx_opt.user_mss &&
1441 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1442 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1443 
1444 	tcp_initialize_rcv_mss(newsk);
1445 
1446 #ifdef CONFIG_TCP_MD5SIG
1447 	/* Copy over the MD5 key from the original socket */
1448 	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1449 	if (key != NULL) {
1450 		/*
1451 		 * We're using one, so create a matching key
1452 		 * on the newsk structure. If we fail to get
1453 		 * memory, then we end up not copying the key
1454 		 * across. Shucks.
1455 		 */
1456 		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1457 		if (newkey != NULL)
1458 			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1459 					  newkey, key->keylen);
1460 		newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1461 	}
1462 #endif
1463 
1464 	__inet_hash_nolisten(newsk, NULL);
1465 	__inet_inherit_port(sk, newsk);
1466 
1467 	return newsk;
1468 
1469 exit_overflow:
1470 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1471 exit:
1472 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1473 	dst_release(dst);
1474 	return NULL;
1475 }
1476 
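/* For a segment aimed at a listening socket, look for a matching half-open
 * request (and let tcp_check_req() finish the handshake) or an already
 * established child; with syncookies enabled, a bare ACK may also be
 * validated as a cookie.
 */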
1477 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1478 {
1479 	struct tcphdr *th = tcp_hdr(skb);
1480 	const struct iphdr *iph = ip_hdr(skb);
1481 	struct sock *nsk;
1482 	struct request_sock **prev;
1483 	/* Find possible connection requests. */
1484 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1485 						       iph->saddr, iph->daddr);
1486 	if (req)
1487 		return tcp_check_req(sk, skb, req, prev);
1488 
1489 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1490 			th->source, iph->daddr, th->dest, inet_iif(skb));
1491 
1492 	if (nsk) {
1493 		if (nsk->sk_state != TCP_TIME_WAIT) {
1494 			bh_lock_sock(nsk);
1495 			return nsk;
1496 		}
1497 		inet_twsk_put(inet_twsk(nsk));
1498 		return NULL;
1499 	}
1500 
1501 #ifdef CONFIG_SYN_COOKIES
1502 	if (!th->rst && !th->syn && th->ack)
1503 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1504 #endif
1505 	return sk;
1506 }
1507 
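/* Verify, or set up deferred verification of, the TCP checksum on receive:
 * short packets (<= 76 bytes) are checked immediately, longer ones keep the
 * pseudo-header sum so the final check can be completed later.
 */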
1508 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1509 {
1510 	const struct iphdr *iph = ip_hdr(skb);
1511 
1512 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1513 		if (!tcp_v4_check(skb->len, iph->saddr,
1514 				  iph->daddr, skb->csum)) {
1515 			skb->ip_summed = CHECKSUM_UNNECESSARY;
1516 			return 0;
1517 		}
1518 	}
1519 
1520 	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1521 				       skb->len, IPPROTO_TCP, 0);
1522 
1523 	if (skb->len <= 76) {
1524 		return __skb_checksum_complete(skb);
1525 	}
1526 	return 0;
1527 }
1528 
1529 
1530 /* The socket must have its spinlock held when we get
1531  * here.
1532  *
1533  * We have a potential double-lock case here, so even when
1534  * doing backlog processing we use the BH locking scheme.
1535  * This is because we cannot sleep with the original spinlock
1536  * held.
1537  */
1538 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1539 {
1540 	struct sock *rsk;
1541 #ifdef CONFIG_TCP_MD5SIG
1542 	/*
1543 	 * We really want to reject the packet as early as possible
1544 	 * if:
1545 	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1546 	 *  o There is an MD5 option and we're not expecting one
1547 	 */
1548 	if (tcp_v4_inbound_md5_hash(sk, skb))
1549 		goto discard;
1550 #endif
1551 
1552 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1553 		TCP_CHECK_TIMER(sk);
1554 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1555 			rsk = sk;
1556 			goto reset;
1557 		}
1558 		TCP_CHECK_TIMER(sk);
1559 		return 0;
1560 	}
1561 
1562 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1563 		goto csum_err;
1564 
1565 	if (sk->sk_state == TCP_LISTEN) {
1566 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1567 		if (!nsk)
1568 			goto discard;
1569 
1570 		if (nsk != sk) {
1571 			if (tcp_child_process(sk, nsk, skb)) {
1572 				rsk = nsk;
1573 				goto reset;
1574 			}
1575 			return 0;
1576 		}
1577 	}
1578 
1579 	TCP_CHECK_TIMER(sk);
1580 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1581 		rsk = sk;
1582 		goto reset;
1583 	}
1584 	TCP_CHECK_TIMER(sk);
1585 	return 0;
1586 
1587 reset:
1588 	tcp_v4_send_reset(rsk, skb);
1589 discard:
1590 	kfree_skb(skb);
1591 	/* Be careful here. If this function gets more complicated and
1592 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1593 	 * might be destroyed here. This current version compiles correctly,
1594 	 * but you have been warned.
1595 	 */
1596 	return 0;
1597 
1598 csum_err:
1599 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1600 	goto discard;
1601 }
1602 
1603 /*
1604  *	From tcp_input.c
1605  */
1606 
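/* Main IPv4 receive entry point, called from the IP layer for every TCP
 * segment: validate the header, look up the owning socket, and then either
 * process the segment directly, prequeue it, or push it onto the backlog
 * when the socket is owned by user context.
 */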
1607 int tcp_v4_rcv(struct sk_buff *skb)
1608 {
1609 	const struct iphdr *iph;
1610 	struct tcphdr *th;
1611 	struct sock *sk;
1612 	int ret;
1613 	struct net *net = dev_net(skb->dev);
1614 
1615 	if (skb->pkt_type != PACKET_HOST)
1616 		goto discard_it;
1617 
1618 	/* Count it even if it's bad */
1619 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1620 
1621 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1622 		goto discard_it;
1623 
1624 	th = tcp_hdr(skb);
1625 
1626 	if (th->doff < sizeof(struct tcphdr) / 4)
1627 		goto bad_packet;
1628 	if (!pskb_may_pull(skb, th->doff * 4))
1629 		goto discard_it;
1630 
1631 	/* An explanation is required here, I think.
1632 	 * Packet length and doff are validated by header prediction,
1633 	 * provided the case of th->doff == 0 is eliminated.
1634 	 * So, we defer the checks. */
1635 	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1636 		goto bad_packet;
1637 
1638 	th = tcp_hdr(skb);
1639 	iph = ip_hdr(skb);
1640 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1641 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1642 				    skb->len - th->doff * 4);
1643 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1644 	TCP_SKB_CB(skb)->when	 = 0;
1645 	TCP_SKB_CB(skb)->flags	 = iph->tos;
1646 	TCP_SKB_CB(skb)->sacked	 = 0;
1647 
1648 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1649 	if (!sk)
1650 		goto no_tcp_socket;
1651 
1652 process:
1653 	if (sk->sk_state == TCP_TIME_WAIT)
1654 		goto do_time_wait;
1655 
1656 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1657 		goto discard_and_relse;
1658 	nf_reset(skb);
1659 
1660 	if (sk_filter(sk, skb))
1661 		goto discard_and_relse;
1662 
1663 	skb->dev = NULL;
1664 
1665 	bh_lock_sock_nested(sk);
1666 	ret = 0;
1667 	if (!sock_owned_by_user(sk)) {
1668 #ifdef CONFIG_NET_DMA
1669 		struct tcp_sock *tp = tcp_sk(sk);
1670 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1671 			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1672 		if (tp->ucopy.dma_chan)
1673 			ret = tcp_v4_do_rcv(sk, skb);
1674 		else
1675 #endif
1676 		{
1677 			if (!tcp_prequeue(sk, skb))
1678 				ret = tcp_v4_do_rcv(sk, skb);
1679 		}
1680 	} else
1681 		sk_add_backlog(sk, skb);
1682 	bh_unlock_sock(sk);
1683 
1684 	sock_put(sk);
1685 
1686 	return ret;
1687 
1688 no_tcp_socket:
1689 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1690 		goto discard_it;
1691 
1692 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1693 bad_packet:
1694 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1695 	} else {
1696 		tcp_v4_send_reset(NULL, skb);
1697 	}
1698 
1699 discard_it:
1700 	/* Discard frame. */
1701 	kfree_skb(skb);
1702 	return 0;
1703 
1704 discard_and_relse:
1705 	sock_put(sk);
1706 	goto discard_it;
1707 
1708 do_time_wait:
1709 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1710 		inet_twsk_put(inet_twsk(sk));
1711 		goto discard_it;
1712 	}
1713 
1714 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1715 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1716 		inet_twsk_put(inet_twsk(sk));
1717 		goto discard_it;
1718 	}
1719 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1720 	case TCP_TW_SYN: {
1721 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1722 							&tcp_hashinfo,
1723 							iph->daddr, th->dest,
1724 							inet_iif(skb));
1725 		if (sk2) {
1726 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1727 			inet_twsk_put(inet_twsk(sk));
1728 			sk = sk2;
1729 			goto process;
1730 		}
1731 		/* Fall through to ACK */
1732 	}
1733 	case TCP_TW_ACK:
1734 		tcp_v4_timewait_ack(sk, skb);
1735 		break;
1736 	case TCP_TW_RST:
1737 		goto no_tcp_socket;
1738 	case TCP_TW_SUCCESS:;
1739 	}
1740 	goto discard_it;
1741 }
1742 
1743 /* VJ's idea. Save the last timestamp seen from this destination
1744  * and hold it at least for the normal timewait interval, to use for duplicate
1745  * segment detection in subsequent connections, before they enter synchronized
1746  * state.
1747  */
1748 
1749 int tcp_v4_remember_stamp(struct sock *sk)
1750 {
1751 	struct inet_sock *inet = inet_sk(sk);
1752 	struct tcp_sock *tp = tcp_sk(sk);
1753 	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1754 	struct inet_peer *peer = NULL;
1755 	int release_it = 0;
1756 
1757 	if (!rt || rt->rt_dst != inet->inet_daddr) {
1758 		peer = inet_getpeer(inet->inet_daddr, 1);
1759 		release_it = 1;
1760 	} else {
1761 		if (!rt->peer)
1762 			rt_bind_peer(rt, 1);
1763 		peer = rt->peer;
1764 	}
1765 
1766 	if (peer) {
1767 		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1768 		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1769 		     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1770 			peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1771 			peer->tcp_ts = tp->rx_opt.ts_recent;
1772 		}
1773 		if (release_it)
1774 			inet_putpeer(peer);
1775 		return 1;
1776 	}
1777 
1778 	return 0;
1779 }
1780 
1781 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1782 {
1783 	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1784 
1785 	if (peer) {
1786 		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1787 
1788 		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1789 		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1790 		     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1791 			peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1792 			peer->tcp_ts	   = tcptw->tw_ts_recent;
1793 		}
1794 		inet_putpeer(peer);
1795 		return 1;
1796 	}
1797 
1798 	return 0;
1799 }
1800 
1801 const struct inet_connection_sock_af_ops ipv4_specific = {
1802 	.queue_xmit	   = ip_queue_xmit,
1803 	.send_check	   = tcp_v4_send_check,
1804 	.rebuild_header	   = inet_sk_rebuild_header,
1805 	.conn_request	   = tcp_v4_conn_request,
1806 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1807 	.remember_stamp	   = tcp_v4_remember_stamp,
1808 	.net_header_len	   = sizeof(struct iphdr),
1809 	.setsockopt	   = ip_setsockopt,
1810 	.getsockopt	   = ip_getsockopt,
1811 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1812 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1813 	.bind_conflict	   = inet_csk_bind_conflict,
1814 #ifdef CONFIG_COMPAT
1815 	.compat_setsockopt = compat_ip_setsockopt,
1816 	.compat_getsockopt = compat_ip_getsockopt,
1817 #endif
1818 };
1819 
1820 #ifdef CONFIG_TCP_MD5SIG
1821 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1822 	.md5_lookup		= tcp_v4_md5_lookup,
1823 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1824 	.md5_add		= tcp_v4_md5_add_func,
1825 	.md5_parse		= tcp_v4_parse_md5_keys,
1826 };
1827 #endif
1828 
1829 /* NOTE: A lot of things are set to zero explicitly by the call to
1830  *       sk_alloc(), so they need not be done here.
1831  */
1832 static int tcp_v4_init_sock(struct sock *sk)
1833 {
1834 	struct inet_connection_sock *icsk = inet_csk(sk);
1835 	struct tcp_sock *tp = tcp_sk(sk);
1836 
1837 	skb_queue_head_init(&tp->out_of_order_queue);
1838 	tcp_init_xmit_timers(sk);
1839 	tcp_prequeue_init(tp);
1840 
1841 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1842 	tp->mdev = TCP_TIMEOUT_INIT;
1843 
1844 	/* So many TCP implementations out there (incorrectly) count the
1845 	 * initial SYN frame in their delayed-ACK and congestion control
1846 	 * algorithms that we must have the following bandaid to talk
1847 	 * efficiently to them.  -DaveM
1848 	 */
1849 	tp->snd_cwnd = 2;
1850 
1851 	/* See draft-stevens-tcpca-spec-01 for discussion of the
1852 	 * initialization of these values.
1853 	 */
1854 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1855 	tp->snd_cwnd_clamp = ~0;
1856 	tp->mss_cache = TCP_MSS_DEFAULT;
1857 
1858 	tp->reordering = sysctl_tcp_reordering;
1859 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1860 
1861 	sk->sk_state = TCP_CLOSE;
1862 
1863 	sk->sk_write_space = sk_stream_write_space;
1864 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1865 
1866 	icsk->icsk_af_ops = &ipv4_specific;
1867 	icsk->icsk_sync_mss = tcp_sync_mss;
1868 #ifdef CONFIG_TCP_MD5SIG
1869 	tp->af_specific = &tcp_sock_ipv4_specific;
1870 #endif
1871 
1872 	/* TCP Cookie Transactions */
1873 	if (sysctl_tcp_cookie_size > 0) {
1874 		/* Default, cookies without s_data_payload. */
1875 		tp->cookie_values =
1876 			kzalloc(sizeof(*tp->cookie_values),
1877 				sk->sk_allocation);
1878 		if (tp->cookie_values != NULL)
1879 			kref_init(&tp->cookie_values->kref);
1880 	}
1881 	/* Presumed zeroed, in order of appearance:
1882 	 *	cookie_in_always, cookie_out_never,
1883 	 *	s_data_constant, s_data_in, s_data_out
1884 	 */
1885 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1886 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1887 
1888 	local_bh_disable();
1889 	percpu_counter_inc(&tcp_sockets_allocated);
1890 	local_bh_enable();
1891 
1892 	return 0;
1893 }
1894 
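/* Release everything a TCP socket still owns at destruction time:
 * timers, congestion control state, queued skbs, MD5 keys, the bound
 * port and the cached sendmsg page.
 */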
1895 void tcp_v4_destroy_sock(struct sock *sk)
1896 {
1897 	struct tcp_sock *tp = tcp_sk(sk);
1898 
1899 	tcp_clear_xmit_timers(sk);
1900 
1901 	tcp_cleanup_congestion_control(sk);
1902 
1903 	/* Clean up the write buffer. */
1904 	tcp_write_queue_purge(sk);
1905 
1906 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1907 	__skb_queue_purge(&tp->out_of_order_queue);
1908 
1909 #ifdef CONFIG_TCP_MD5SIG
1910 	/* Clean up the MD5 key list, if any */
1911 	if (tp->md5sig_info) {
1912 		tcp_v4_clear_md5_list(sk);
1913 		kfree(tp->md5sig_info);
1914 		tp->md5sig_info = NULL;
1915 	}
1916 #endif
1917 
1918 #ifdef CONFIG_NET_DMA
1919 	/* Cleans up our sk_async_wait_queue */
1920 	__skb_queue_purge(&sk->sk_async_wait_queue);
1921 #endif
1922 
1923 	/* Clean up the prequeue; it should really be empty by now. */
1924 	__skb_queue_purge(&tp->ucopy.prequeue);
1925 
1926 	/* Clean up a referenced TCP bind bucket. */
1927 	if (inet_csk(sk)->icsk_bind_hash)
1928 		inet_put_port(sk);
1929 
1930 	/*
1931 	 * If a cached sendmsg page exists, free it.
1932 	 */
1933 	if (sk->sk_sndmsg_page) {
1934 		__free_page(sk->sk_sndmsg_page);
1935 		sk->sk_sndmsg_page = NULL;
1936 	}
1937 
1938 	/* TCP Cookie Transactions */
1939 	if (tp->cookie_values != NULL) {
1940 		kref_put(&tp->cookie_values->kref,
1941 			 tcp_cookie_values_release);
1942 		tp->cookie_values = NULL;
1943 	}
1944 
1945 	percpu_counter_dec(&tcp_sockets_allocated);
1946 }
1947 
1948 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1949 
1950 #ifdef CONFIG_PROC_FS
1951 /* Proc filesystem TCP sock list dumping. */
1952 
1953 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1954 {
1955 	return hlist_nulls_empty(head) ? NULL :
1956 		list_entry(head->first, struct inet_timewait_sock, tw_node);
1957 }
1958 
1959 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1960 {
1961 	return !is_a_nulls(tw->tw_node.next) ?
1962 		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1963 }
1964 
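/* Advance to the next listening socket (or pending open request) that
 * matches the requested family and namespace, taking and dropping the
 * per-bucket and syn_wait locks as the walk proceeds.
 */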
1965 static void *listening_get_next(struct seq_file *seq, void *cur)
1966 {
1967 	struct inet_connection_sock *icsk;
1968 	struct hlist_nulls_node *node;
1969 	struct sock *sk = cur;
1970 	struct inet_listen_hashbucket *ilb;
1971 	struct tcp_iter_state *st = seq->private;
1972 	struct net *net = seq_file_net(seq);
1973 
1974 	if (!sk) {
1975 		st->bucket = 0;
1976 		ilb = &tcp_hashinfo.listening_hash[0];
1977 		spin_lock_bh(&ilb->lock);
1978 		sk = sk_nulls_head(&ilb->head);
1979 		goto get_sk;
1980 	}
1981 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1982 	++st->num;
1983 
1984 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1985 		struct request_sock *req = cur;
1986 
1987 		icsk = inet_csk(st->syn_wait_sk);
1988 		req = req->dl_next;
1989 		while (1) {
1990 			while (req) {
1991 				if (req->rsk_ops->family == st->family) {
1992 					cur = req;
1993 					goto out;
1994 				}
1995 				req = req->dl_next;
1996 			}
1997 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1998 				break;
1999 get_req:
2000 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2001 		}
2002 		sk	  = sk_next(st->syn_wait_sk);
2003 		st->state = TCP_SEQ_STATE_LISTENING;
2004 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2005 	} else {
2006 		icsk = inet_csk(sk);
2007 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2008 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2009 			goto start_req;
2010 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2011 		sk = sk_next(sk);
2012 	}
2013 get_sk:
2014 	sk_nulls_for_each_from(sk, node) {
2015 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2016 			cur = sk;
2017 			goto out;
2018 		}
2019 		icsk = inet_csk(sk);
2020 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2021 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2022 start_req:
2023 			st->uid		= sock_i_uid(sk);
2024 			st->syn_wait_sk = sk;
2025 			st->state	= TCP_SEQ_STATE_OPENREQ;
2026 			st->sbucket	= 0;
2027 			goto get_req;
2028 		}
2029 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2030 	}
2031 	spin_unlock_bh(&ilb->lock);
2032 	if (++st->bucket < INET_LHTABLE_SIZE) {
2033 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2034 		spin_lock_bh(&ilb->lock);
2035 		sk = sk_nulls_head(&ilb->head);
2036 		goto get_sk;
2037 	}
2038 	cur = NULL;
2039 out:
2040 	return cur;
2041 }
2042 
2043 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2044 {
2045 	void *rc = listening_get_next(seq, NULL);
2046 
2047 	while (rc && *pos) {
2048 		rc = listening_get_next(seq, rc);
2049 		--*pos;
2050 	}
2051 	return rc;
2052 }
2053 
2054 static inline int empty_bucket(struct tcp_iter_state *st)
2055 {
2056 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2057 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2058 }
2059 
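/* Find the first established or TIME_WAIT socket of the right family
 * and namespace, returning with its ehash bucket lock held.
 */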
2060 static void *established_get_first(struct seq_file *seq)
2061 {
2062 	struct tcp_iter_state *st = seq->private;
2063 	struct net *net = seq_file_net(seq);
2064 	void *rc = NULL;
2065 
2066 	for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2067 		struct sock *sk;
2068 		struct hlist_nulls_node *node;
2069 		struct inet_timewait_sock *tw;
2070 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2071 
2072 		/* Lockless fast path for the common case of empty buckets */
2073 		if (empty_bucket(st))
2074 			continue;
2075 
2076 		spin_lock_bh(lock);
2077 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2078 			if (sk->sk_family != st->family ||
2079 			    !net_eq(sock_net(sk), net)) {
2080 				continue;
2081 			}
2082 			rc = sk;
2083 			goto out;
2084 		}
2085 		st->state = TCP_SEQ_STATE_TIME_WAIT;
2086 		inet_twsk_for_each(tw, node,
2087 				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2088 			if (tw->tw_family != st->family ||
2089 			    !net_eq(twsk_net(tw), net)) {
2090 				continue;
2091 			}
2092 			rc = tw;
2093 			goto out;
2094 		}
2095 		spin_unlock_bh(lock);
2096 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2097 	}
2098 out:
2099 	return rc;
2100 }
2101 
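/* Step to the next matching established or TIME_WAIT socket, moving on
 * to the next non-empty ehash bucket when the current one is exhausted.
 */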
2102 static void *established_get_next(struct seq_file *seq, void *cur)
2103 {
2104 	struct sock *sk = cur;
2105 	struct inet_timewait_sock *tw;
2106 	struct hlist_nulls_node *node;
2107 	struct tcp_iter_state *st = seq->private;
2108 	struct net *net = seq_file_net(seq);
2109 
2110 	++st->num;
2111 
2112 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2113 		tw = cur;
2114 		tw = tw_next(tw);
2115 get_tw:
2116 		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2117 			tw = tw_next(tw);
2118 		}
2119 		if (tw) {
2120 			cur = tw;
2121 			goto out;
2122 		}
2123 		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2124 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2125 
2126 		/* Look for the next non-empty bucket */
2127 		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2128 				empty_bucket(st))
2129 			;
2130 		if (st->bucket > tcp_hashinfo.ehash_mask)
2131 			return NULL;
2132 
2133 		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2134 		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2135 	} else
2136 		sk = sk_nulls_next(sk);
2137 
2138 	sk_nulls_for_each_from(sk, node) {
2139 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2140 			goto found;
2141 	}
2142 
2143 	st->state = TCP_SEQ_STATE_TIME_WAIT;
2144 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2145 	goto get_tw;
2146 found:
2147 	cur = sk;
2148 out:
2149 	return cur;
2150 }
2151 
2152 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2153 {
2154 	void *rc = established_get_first(seq);
2155 
2156 	while (rc && pos) {
2157 		rc = established_get_next(seq, rc);
2158 		--pos;
2159 	}
2160 	return rc;
2161 }
2162 
2163 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2164 {
2165 	void *rc;
2166 	struct tcp_iter_state *st = seq->private;
2167 
2168 	st->state = TCP_SEQ_STATE_LISTENING;
2169 	rc	  = listening_get_idx(seq, &pos);
2170 
2171 	if (!rc) {
2172 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2173 		rc	  = established_get_idx(seq, pos);
2174 	}
2175 
2176 	return rc;
2177 }
2178 
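/* seq_file hooks: tcp_seq_start() positions the walk (returning
 * SEQ_START_TOKEN for the header line), tcp_seq_next() advances it and
 * tcp_seq_stop() drops whatever lock the walk still holds.
 */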
2179 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2180 {
2181 	struct tcp_iter_state *st = seq->private;
2182 	st->state = TCP_SEQ_STATE_LISTENING;
2183 	st->num = 0;
2184 	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2185 }
2186 
2187 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2188 {
2189 	void *rc = NULL;
2190 	struct tcp_iter_state *st;
2191 
2192 	if (v == SEQ_START_TOKEN) {
2193 		rc = tcp_get_idx(seq, 0);
2194 		goto out;
2195 	}
2196 	st = seq->private;
2197 
2198 	switch (st->state) {
2199 	case TCP_SEQ_STATE_OPENREQ:
2200 	case TCP_SEQ_STATE_LISTENING:
2201 		rc = listening_get_next(seq, v);
2202 		if (!rc) {
2203 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2204 			rc	  = established_get_first(seq);
2205 		}
2206 		break;
2207 	case TCP_SEQ_STATE_ESTABLISHED:
2208 	case TCP_SEQ_STATE_TIME_WAIT:
2209 		rc = established_get_next(seq, v);
2210 		break;
2211 	}
2212 out:
2213 	++*pos;
2214 	return rc;
2215 }
2216 
2217 static void tcp_seq_stop(struct seq_file *seq, void *v)
2218 {
2219 	struct tcp_iter_state *st = seq->private;
2220 
2221 	switch (st->state) {
2222 	case TCP_SEQ_STATE_OPENREQ:
2223 		if (v) {
2224 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2225 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2226 		}
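		/* fall through */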
2227 	case TCP_SEQ_STATE_LISTENING:
2228 		if (v != SEQ_START_TOKEN)
2229 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2230 		break;
2231 	case TCP_SEQ_STATE_TIME_WAIT:
2232 	case TCP_SEQ_STATE_ESTABLISHED:
2233 		if (v)
2234 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2235 		break;
2236 	}
2237 }
2238 
2239 static int tcp_seq_open(struct inode *inode, struct file *file)
2240 {
2241 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2242 	struct tcp_iter_state *s;
2243 	int err;
2244 
2245 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2246 			  sizeof(struct tcp_iter_state));
2247 	if (err < 0)
2248 		return err;
2249 
2250 	s = ((struct seq_file *)file->private_data)->private;
2251 	s->family		= afinfo->family;
2252 	return 0;
2253 }
2254 
2255 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2256 {
2257 	int rc = 0;
2258 	struct proc_dir_entry *p;
2259 
2260 	afinfo->seq_fops.open		= tcp_seq_open;
2261 	afinfo->seq_fops.read		= seq_read;
2262 	afinfo->seq_fops.llseek		= seq_lseek;
2263 	afinfo->seq_fops.release	= seq_release_net;
2264 
2265 	afinfo->seq_ops.start		= tcp_seq_start;
2266 	afinfo->seq_ops.next		= tcp_seq_next;
2267 	afinfo->seq_ops.stop		= tcp_seq_stop;
2268 
2269 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2270 			     &afinfo->seq_fops, afinfo);
2271 	if (!p)
2272 		rc = -ENOMEM;
2273 	return rc;
2274 }
2275 
2276 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2277 {
2278 	proc_net_remove(net, afinfo->name);
2279 }
2280 
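/* Format one half-open (SYN_RECV) request as a /proc/net/tcp line. */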
2281 static void get_openreq4(struct sock *sk, struct request_sock *req,
2282 			 struct seq_file *f, int i, int uid, int *len)
2283 {
2284 	const struct inet_request_sock *ireq = inet_rsk(req);
2285 	int ttd = req->expires - jiffies;
2286 
2287 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2288 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2289 		i,
2290 		ireq->loc_addr,
2291 		ntohs(inet_sk(sk)->inet_sport),
2292 		ireq->rmt_addr,
2293 		ntohs(ireq->rmt_port),
2294 		TCP_SYN_RECV,
2295 		0, 0, /* could print option size, but that is af dependent. */
2296 		1,    /* timers active (only the expire timer) */
2297 		jiffies_to_clock_t(ttd),
2298 		req->retrans,
2299 		uid,
2300 		0,  /* non-standard timer */
2301 		0, /* open_requests have no inode */
2302 		atomic_read(&sk->sk_refcnt),
2303 		req,
2304 		len);
2305 }
2306 
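/* Format one full TCP socket as a /proc/net/tcp line. */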
2307 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2308 {
2309 	int timer_active;
2310 	unsigned long timer_expires;
2311 	struct tcp_sock *tp = tcp_sk(sk);
2312 	const struct inet_connection_sock *icsk = inet_csk(sk);
2313 	struct inet_sock *inet = inet_sk(sk);
2314 	__be32 dest = inet->inet_daddr;
2315 	__be32 src = inet->inet_rcv_saddr;
2316 	__u16 destp = ntohs(inet->inet_dport);
2317 	__u16 srcp = ntohs(inet->inet_sport);
2318 	int rx_queue;
2319 
2320 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2321 		timer_active	= 1;
2322 		timer_expires	= icsk->icsk_timeout;
2323 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2324 		timer_active	= 4;
2325 		timer_expires	= icsk->icsk_timeout;
2326 	} else if (timer_pending(&sk->sk_timer)) {
2327 		timer_active	= 2;
2328 		timer_expires	= sk->sk_timer.expires;
2329 	} else {
2330 		timer_active	= 0;
2331 		timer_expires = jiffies;
2332 	}
2333 
2334 	if (sk->sk_state == TCP_LISTEN)
2335 		rx_queue = sk->sk_ack_backlog;
2336 	else
2337 		/*
2338 		 * Because we don't lock the socket, we might find a transient negative value.
2339 		 */
2340 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2341 
2342 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2343 			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2344 		i, src, srcp, dest, destp, sk->sk_state,
2345 		tp->write_seq - tp->snd_una,
2346 		rx_queue,
2347 		timer_active,
2348 		jiffies_to_clock_t(timer_expires - jiffies),
2349 		icsk->icsk_retransmits,
2350 		sock_i_uid(sk),
2351 		icsk->icsk_probes_out,
2352 		sock_i_ino(sk),
2353 		atomic_read(&sk->sk_refcnt), sk,
2354 		jiffies_to_clock_t(icsk->icsk_rto),
2355 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2356 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2357 		tp->snd_cwnd,
2358 		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2359 		len);
2360 }
2361 
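/* Format one TIME_WAIT socket as a /proc/net/tcp line. */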
2362 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2363 			       struct seq_file *f, int i, int *len)
2364 {
2365 	__be32 dest, src;
2366 	__u16 destp, srcp;
2367 	int ttd = tw->tw_ttd - jiffies;
2368 
2369 	if (ttd < 0)
2370 		ttd = 0;
2371 
2372 	dest  = tw->tw_daddr;
2373 	src   = tw->tw_rcv_saddr;
2374 	destp = ntohs(tw->tw_dport);
2375 	srcp  = ntohs(tw->tw_sport);
2376 
2377 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2378 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2379 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2380 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2381 		atomic_read(&tw->tw_refcnt), tw, len);
2382 }
2383 
2384 #define TMPSZ 150
2385 
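/* Emit one /proc/net/tcp entry (or the header), padded to TMPSZ - 1
 * characters.
 */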
2386 static int tcp4_seq_show(struct seq_file *seq, void *v)
2387 {
2388 	struct tcp_iter_state *st;
2389 	int len;
2390 
2391 	if (v == SEQ_START_TOKEN) {
2392 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2393 			   "  sl  local_address rem_address   st tx_queue "
2394 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2395 			   "inode");
2396 		goto out;
2397 	}
2398 	st = seq->private;
2399 
2400 	switch (st->state) {
2401 	case TCP_SEQ_STATE_LISTENING:
2402 	case TCP_SEQ_STATE_ESTABLISHED:
2403 		get_tcp4_sock(v, seq, st->num, &len);
2404 		break;
2405 	case TCP_SEQ_STATE_OPENREQ:
2406 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2407 		break;
2408 	case TCP_SEQ_STATE_TIME_WAIT:
2409 		get_timewait4_sock(v, seq, st->num, &len);
2410 		break;
2411 	}
2412 	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2413 out:
2414 	return 0;
2415 }
2416 
2417 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2418 	.name		= "tcp",
2419 	.family		= AF_INET,
2420 	.seq_fops	= {
2421 		.owner		= THIS_MODULE,
2422 	},
2423 	.seq_ops	= {
2424 		.show		= tcp4_seq_show,
2425 	},
2426 };
2427 
2428 static int tcp4_proc_init_net(struct net *net)
2429 {
2430 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2431 }
2432 
2433 static void tcp4_proc_exit_net(struct net *net)
2434 {
2435 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2436 }
2437 
2438 static struct pernet_operations tcp4_net_ops = {
2439 	.init = tcp4_proc_init_net,
2440 	.exit = tcp4_proc_exit_net,
2441 };
2442 
2443 int __init tcp4_proc_init(void)
2444 {
2445 	return register_pernet_subsys(&tcp4_net_ops);
2446 }
2447 
2448 void tcp4_proc_exit(void)
2449 {
2450 	unregister_pernet_subsys(&tcp4_net_ops);
2451 }
2452 #endif /* CONFIG_PROC_FS */
2453 
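/* GRO receive for IPv4/TCP: verify the pseudo-header checksum when the
 * device reported CHECKSUM_COMPLETE, flush when no usable checksum is
 * available, and hand everything else to the generic TCP GRO engine.
 */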
2454 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2455 {
2456 	struct iphdr *iph = skb_gro_network_header(skb);
2457 
2458 	switch (skb->ip_summed) {
2459 	case CHECKSUM_COMPLETE:
2460 		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2461 				  skb->csum)) {
2462 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2463 			break;
2464 		}
2465 
2466 		/* fall through */
2467 	case CHECKSUM_NONE:
2468 		NAPI_GRO_CB(skb)->flush = 1;
2469 		return NULL;
2470 	}
2471 
2472 	return tcp_gro_receive(head, skb);
2473 }
2474 EXPORT_SYMBOL(tcp4_gro_receive);
2475 
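/* GRO complete for IPv4/TCP: seed th->check with the pseudo-header
 * checksum of the merged segment, mark it SKB_GSO_TCPV4 and finish in
 * the generic layer.
 */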
2476 int tcp4_gro_complete(struct sk_buff *skb)
2477 {
2478 	struct iphdr *iph = ip_hdr(skb);
2479 	struct tcphdr *th = tcp_hdr(skb);
2480 
2481 	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2482 				  iph->saddr, iph->daddr, 0);
2483 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2484 
2485 	return tcp_gro_complete(skb);
2486 }
2487 EXPORT_SYMBOL(tcp4_gro_complete);
2488 
2489 struct proto tcp_prot = {
2490 	.name			= "TCP",
2491 	.owner			= THIS_MODULE,
2492 	.close			= tcp_close,
2493 	.connect		= tcp_v4_connect,
2494 	.disconnect		= tcp_disconnect,
2495 	.accept			= inet_csk_accept,
2496 	.ioctl			= tcp_ioctl,
2497 	.init			= tcp_v4_init_sock,
2498 	.destroy		= tcp_v4_destroy_sock,
2499 	.shutdown		= tcp_shutdown,
2500 	.setsockopt		= tcp_setsockopt,
2501 	.getsockopt		= tcp_getsockopt,
2502 	.recvmsg		= tcp_recvmsg,
2503 	.backlog_rcv		= tcp_v4_do_rcv,
2504 	.hash			= inet_hash,
2505 	.unhash			= inet_unhash,
2506 	.get_port		= inet_csk_get_port,
2507 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2508 	.sockets_allocated	= &tcp_sockets_allocated,
2509 	.orphan_count		= &tcp_orphan_count,
2510 	.memory_allocated	= &tcp_memory_allocated,
2511 	.memory_pressure	= &tcp_memory_pressure,
2512 	.sysctl_mem		= sysctl_tcp_mem,
2513 	.sysctl_wmem		= sysctl_tcp_wmem,
2514 	.sysctl_rmem		= sysctl_tcp_rmem,
2515 	.max_header		= MAX_TCP_HEADER,
2516 	.obj_size		= sizeof(struct tcp_sock),
2517 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2518 	.twsk_prot		= &tcp_timewait_sock_ops,
2519 	.rsk_prot		= &tcp_request_sock_ops,
2520 	.h.hashinfo		= &tcp_hashinfo,
2521 #ifdef CONFIG_COMPAT
2522 	.compat_setsockopt	= compat_tcp_setsockopt,
2523 	.compat_getsockopt	= compat_tcp_getsockopt,
2524 #endif
2525 };
2526 
2527 
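/* Per-namespace setup/teardown: create (and later destroy) the kernel
 * control socket this namespace uses to send replies, such as RSTs,
 * that are not tied to a full socket.
 */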
2528 static int __net_init tcp_sk_init(struct net *net)
2529 {
2530 	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2531 				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2532 }
2533 
2534 static void __net_exit tcp_sk_exit(struct net *net)
2535 {
2536 	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2537 }
2538 
2539 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2540 {
2541 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2542 }
2543 
2544 static struct pernet_operations __net_initdata tcp_sk_ops = {
2545 	.init	   = tcp_sk_init,
2546 	.exit	   = tcp_sk_exit,
2547 	.exit_batch = tcp_sk_exit_batch,
2548 };
2549 
2550 void __init tcp_v4_init(void)
2551 {
2552 	inet_hashinfo_init(&tcp_hashinfo);
2553 	if (register_pernet_subsys(&tcp_sk_ops))
2554 		panic("Failed to create the TCP control socket.\n");
2555 }
2556 
2557 EXPORT_SYMBOL(ipv4_specific);
2558 EXPORT_SYMBOL(tcp_hashinfo);
2559 EXPORT_SYMBOL(tcp_prot);
2560 EXPORT_SYMBOL(tcp_v4_conn_request);
2561 EXPORT_SYMBOL(tcp_v4_connect);
2562 EXPORT_SYMBOL(tcp_v4_do_rcv);
2563 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2564 EXPORT_SYMBOL(tcp_v4_send_check);
2565 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2566 
2567 #ifdef CONFIG_PROC_FS
2568 EXPORT_SYMBOL(tcp_proc_register);
2569 EXPORT_SYMBOL(tcp_proc_unregister);
2570 #endif
2571 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2572 
2573