xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 078073a3)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63 #include <linux/slab.h>
64 
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75 #include <net/secure_seq.h>
76 
77 #include <linux/inet.h>
78 #include <linux/ipv6.h>
79 #include <linux/stddef.h>
80 #include <linux/proc_fs.h>
81 #include <linux/seq_file.h>
82 
83 #include <linux/crypto.h>
84 #include <linux/scatterlist.h>
85 
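/* Tunable as net.ipv4.tcp_tw_reuse and net.ipv4.tcp_low_latency, respectively. */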
86 int sysctl_tcp_tw_reuse __read_mostly;
87 int sysctl_tcp_low_latency __read_mostly;
88 EXPORT_SYMBOL(sysctl_tcp_low_latency);
89 
90 
91 #ifdef CONFIG_TCP_MD5SIG
92 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
93 						   __be32 addr);
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
95 			       __be32 daddr, __be32 saddr, struct tcphdr *th);
96 #else
97 static inline
98 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
99 {
100 	return NULL;
101 }
102 #endif
103 
104 struct inet_hashinfo tcp_hashinfo;
105 EXPORT_SYMBOL(tcp_hashinfo);
106 
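/* Pick the initial sequence number for an outgoing connection from the
 * connection 4-tuple via secure_tcp_sequence_number(), so that it is hard
 * for an off-path attacker to predict.
 */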
107 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
108 {
109 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
110 					  ip_hdr(skb)->saddr,
111 					  tcp_hdr(skb)->dest,
112 					  tcp_hdr(skb)->source);
113 }
114 
115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
116 {
117 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
118 	struct tcp_sock *tp = tcp_sk(sk);
119 
120 	/* With PAWS, it is safe from the viewpoint
121 	   of data integrity. Even without PAWS it is safe provided sequence
122 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
123 
124 	   Actually, the idea is close to VJ's: only the timestamp cache is
125 	   held not per host but per port pair, and the TW bucket is used as
126 	   the state holder.
127 
128 	   If the TW bucket has already been destroyed we fall back to VJ's
129 	   scheme and use the initial timestamp retrieved from the peer table.
130 	 */
131 	if (tcptw->tw_ts_recent_stamp &&
132 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
133 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
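		/* Step well past the old incarnation's last send window so
		 * that the sequence spaces of the two connections cannot
		 * overlap.
		 */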
134 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
135 		if (tp->write_seq == 0)
136 			tp->write_seq = 1;
137 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
138 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
139 		sock_hold(sktw);
140 		return 1;
141 	}
142 
143 	return 0;
144 }
145 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
146 
147 /* This will initiate an outgoing connection. */
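/* Reached from userspace via connect(2) on an AF_INET TCP socket; a rough
 * sketch of the caller's side (illustrative address, not kernel code):
 *
 *	struct sockaddr_in sin = { .sin_family = AF_INET,
 *				   .sin_port   = htons(80) };
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 */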
148 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
149 {
150 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
151 	struct inet_sock *inet = inet_sk(sk);
152 	struct tcp_sock *tp = tcp_sk(sk);
153 	__be16 orig_sport, orig_dport;
154 	__be32 daddr, nexthop;
155 	struct flowi4 *fl4;
156 	struct rtable *rt;
157 	int err;
158 	struct ip_options_rcu *inet_opt;
159 
160 	if (addr_len < sizeof(struct sockaddr_in))
161 		return -EINVAL;
162 
163 	if (usin->sin_family != AF_INET)
164 		return -EAFNOSUPPORT;
165 
166 	nexthop = daddr = usin->sin_addr.s_addr;
167 	inet_opt = rcu_dereference_protected(inet->inet_opt,
168 					     sock_owned_by_user(sk));
169 	if (inet_opt && inet_opt->opt.srr) {
170 		if (!daddr)
171 			return -EINVAL;
172 		nexthop = inet_opt->opt.faddr;
173 	}
174 
175 	orig_sport = inet->inet_sport;
176 	orig_dport = usin->sin_port;
177 	fl4 = &inet->cork.fl.u.ip4;
178 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
179 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
180 			      IPPROTO_TCP,
181 			      orig_sport, orig_dport, sk, true);
182 	if (IS_ERR(rt)) {
183 		err = PTR_ERR(rt);
184 		if (err == -ENETUNREACH)
185 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
186 		return err;
187 	}
188 
189 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
190 		ip_rt_put(rt);
191 		return -ENETUNREACH;
192 	}
193 
194 	if (!inet_opt || !inet_opt->opt.srr)
195 		daddr = fl4->daddr;
196 
197 	if (!inet->inet_saddr)
198 		inet->inet_saddr = fl4->saddr;
199 	inet->inet_rcv_saddr = inet->inet_saddr;
200 
201 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
202 		/* Reset inherited state */
203 		tp->rx_opt.ts_recent	   = 0;
204 		tp->rx_opt.ts_recent_stamp = 0;
205 		tp->write_seq		   = 0;
206 	}
207 
208 	if (tcp_death_row.sysctl_tw_recycle &&
209 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
210 		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
211 		/*
212 		 * VJ's idea. We save the last timestamp seen from
213 		 * the destination in the peer table when entering state
214 		 * TIME-WAIT, and initialize rx_opt.ts_recent from it
215 		 * when trying a new connection.
216 		 */
217 		if (peer) {
218 			inet_peer_refcheck(peer);
219 			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
220 				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
221 				tp->rx_opt.ts_recent = peer->tcp_ts;
222 			}
223 		}
224 	}
225 
226 	inet->inet_dport = usin->sin_port;
227 	inet->inet_daddr = daddr;
228 
229 	inet_csk(sk)->icsk_ext_hdr_len = 0;
230 	if (inet_opt)
231 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
232 
233 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
234 
235 	/* Socket identity is still unknown (sport may be zero).
236 	 * However, we set the state to SYN-SENT and, without releasing the
237 	 * socket lock, select a source port, enter ourselves into the hash
238 	 * tables and complete initialization after this.
239 	 */
240 	tcp_set_state(sk, TCP_SYN_SENT);
241 	err = inet_hash_connect(&tcp_death_row, sk);
242 	if (err)
243 		goto failure;
244 
245 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
246 			       inet->inet_sport, inet->inet_dport, sk);
247 	if (IS_ERR(rt)) {
248 		err = PTR_ERR(rt);
249 		rt = NULL;
250 		goto failure;
251 	}
252 	/* OK, now commit destination to socket.  */
253 	sk->sk_gso_type = SKB_GSO_TCPV4;
254 	sk_setup_caps(sk, &rt->dst);
255 
256 	if (!tp->write_seq)
257 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
258 							   inet->inet_daddr,
259 							   inet->inet_sport,
260 							   usin->sin_port);
261 
262 	inet->inet_id = tp->write_seq ^ jiffies;
263 
264 	err = tcp_connect(sk);
265 	rt = NULL;
266 	if (err)
267 		goto failure;
268 
269 	return 0;
270 
271 failure:
272 	/*
273 	 * This unhashes the socket and releases the local port,
274 	 * if necessary.
275 	 */
276 	tcp_set_state(sk, TCP_CLOSE);
277 	ip_rt_put(rt);
278 	sk->sk_route_caps = 0;
279 	inet->inet_dport = 0;
280 	return err;
281 }
282 EXPORT_SYMBOL(tcp_v4_connect);
283 
284 /*
285  * This routine does path mtu discovery as defined in RFC1191.
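 * It is invoked from tcp_v4_err() when an ICMP_FRAG_NEEDED error arrives
 * for an established connection.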
286  */
287 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
288 {
289 	struct dst_entry *dst;
290 	struct inet_sock *inet = inet_sk(sk);
291 
292 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
293 	 * sent out by Linux are always < 576 bytes so they should go through
294 	 * unfragmented).
295 	 */
296 	if (sk->sk_state == TCP_LISTEN)
297 		return;
298 
299 	/* We don't check in the dst entry whether pmtu discovery is forbidden
300 	 * on this route. We just assume that no packet-too-big messages
301 	 * are sent back when pmtu discovery is not active.
302 	 * There is a small race when the user changes this flag in the
303 	 * route, but I think that's acceptable.
304 	 */
305 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
306 		return;
307 
308 	dst->ops->update_pmtu(dst, mtu);
309 
310 	/* Something is about to go wrong... Remember the soft error
311 	 * in case this connection is not able to recover.
312 	 */
313 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
314 		sk->sk_err_soft = EMSGSIZE;
315 
316 	mtu = dst_mtu(dst);
317 
318 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
319 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
320 		tcp_sync_mss(sk, mtu);
321 
322 		/* Resend the TCP packet because it's
323 		 * clear that the old packet has been
324 		 * dropped. This is the new "fast" path mtu
325 		 * discovery.
326 		 */
327 		tcp_simple_retransmit(sk);
328 	} /* else let the usual retransmit timer handle it */
329 }
330 
331 /*
332  * This routine is called by the ICMP module when it gets some
333  * sort of error condition.  If err < 0 then the socket should
334  * be closed and the error returned to the user.  If err > 0
335  * it's just the icmp type << 8 | icmp code.  After adjustment
336  * header points to the first 8 bytes of the tcp header.  We need
337  * to find the appropriate port.
338  *
339  * The locking strategy used here is very "optimistic". When
340  * someone else accesses the socket the ICMP is just dropped
341  * and for some paths there is no check at all.
342  * A more general error queue to queue errors for later handling
343  * is probably better.
344  *
345  */
346 
347 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
348 {
349 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
350 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
351 	struct inet_connection_sock *icsk;
352 	struct tcp_sock *tp;
353 	struct inet_sock *inet;
354 	const int type = icmp_hdr(icmp_skb)->type;
355 	const int code = icmp_hdr(icmp_skb)->code;
356 	struct sock *sk;
357 	struct sk_buff *skb;
358 	__u32 seq;
359 	__u32 remaining;
360 	int err;
361 	struct net *net = dev_net(icmp_skb->dev);
362 
363 	if (icmp_skb->len < (iph->ihl << 2) + 8) {
364 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
365 		return;
366 	}
367 
368 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
369 			iph->saddr, th->source, inet_iif(icmp_skb));
370 	if (!sk) {
371 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
372 		return;
373 	}
374 	if (sk->sk_state == TCP_TIME_WAIT) {
375 		inet_twsk_put(inet_twsk(sk));
376 		return;
377 	}
378 
379 	bh_lock_sock(sk);
380 	/* If too many ICMPs get dropped on busy
381 	 * servers this needs to be solved differently.
382 	 */
383 	if (sock_owned_by_user(sk))
384 		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
385 
386 	if (sk->sk_state == TCP_CLOSE)
387 		goto out;
388 
389 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
390 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
391 		goto out;
392 	}
393 
394 	icsk = inet_csk(sk);
395 	tp = tcp_sk(sk);
396 	seq = ntohl(th->seq);
397 	if (sk->sk_state != TCP_LISTEN &&
398 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
399 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
400 		goto out;
401 	}
402 
403 	switch (type) {
404 	case ICMP_SOURCE_QUENCH:
405 		/* Just silently ignore these. */
406 		goto out;
407 	case ICMP_PARAMETERPROB:
408 		err = EPROTO;
409 		break;
410 	case ICMP_DEST_UNREACH:
411 		if (code > NR_ICMP_UNREACH)
412 			goto out;
413 
414 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
415 			if (!sock_owned_by_user(sk))
416 				do_pmtu_discovery(sk, iph, info);
417 			goto out;
418 		}
419 
420 		err = icmp_err_convert[code].errno;
421 		/* check if icmp_skb allows revert of backoff
422 		 * (see draft-zimmermann-tcp-lcd) */
423 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
424 			break;
425 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
426 		    !icsk->icsk_backoff)
427 			break;
428 
429 		if (sock_owned_by_user(sk))
430 			break;
431 
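		/* draft-zimmermann-tcp-lcd: undo one step of exponential
		 * backoff, recompute the RTO from srtt (or the initial
		 * timeout) and re-arm the retransmit timer with whatever
		 * time is left.
		 */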
432 		icsk->icsk_backoff--;
433 		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
434 			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
435 		tcp_bound_rto(sk);
436 
437 		skb = tcp_write_queue_head(sk);
438 		BUG_ON(!skb);
439 
440 		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
441 				tcp_time_stamp - TCP_SKB_CB(skb)->when);
442 
443 		if (remaining) {
444 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
445 						  remaining, TCP_RTO_MAX);
446 		} else {
447 			/* The reverted RTO has already expired, so the
448 			 * retransmission is due; retransmit now. */
449 			tcp_retransmit_timer(sk);
450 		}
451 
452 		break;
453 	case ICMP_TIME_EXCEEDED:
454 		err = EHOSTUNREACH;
455 		break;
456 	default:
457 		goto out;
458 	}
459 
460 	switch (sk->sk_state) {
461 		struct request_sock *req, **prev;
462 	case TCP_LISTEN:
463 		if (sock_owned_by_user(sk))
464 			goto out;
465 
466 		req = inet_csk_search_req(sk, &prev, th->dest,
467 					  iph->daddr, iph->saddr);
468 		if (!req)
469 			goto out;
470 
471 		/* ICMPs are not backlogged, hence we cannot get
472 		   an established socket here.
473 		 */
474 		WARN_ON(req->sk);
475 
476 		if (seq != tcp_rsk(req)->snt_isn) {
477 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
478 			goto out;
479 		}
480 
481 		/*
482 		 * Still in SYN_RECV, just remove it silently.
483 		 * There is no good way to pass the error to the newly
484 		 * created socket, and POSIX does not want network
485 		 * errors returned from accept().
486 		 */
487 		inet_csk_reqsk_queue_drop(sk, req, prev);
488 		goto out;
489 
490 	case TCP_SYN_SENT:
491 	case TCP_SYN_RECV:  /* Cannot happen normally.
492 			       It can, e.g., if SYNs crossed.
493 			     */
494 		if (!sock_owned_by_user(sk)) {
495 			sk->sk_err = err;
496 
497 			sk->sk_error_report(sk);
498 
499 			tcp_done(sk);
500 		} else {
501 			sk->sk_err_soft = err;
502 		}
503 		goto out;
504 	}
505 
506 	/* If we've already connected we will keep trying
507 	 * until we time out, or the user gives up.
508 	 *
509 	 * RFC 1122 4.2.3.9 allows considering as hard errors
510 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
511 	 * but it is obsoleted by pmtu discovery).
512 	 *
513 	 * Note that in the modern internet, where routing is unreliable
514 	 * and broken firewalls sit in every dark corner sending random
515 	 * errors ordered by their masters, even these two messages finally
516 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
517 	 *
518 	 * Now we are in compliance with RFCs.
519 	 *							--ANK (980905)
520 	 */
521 
522 	inet = inet_sk(sk);
523 	if (!sock_owned_by_user(sk) && inet->recverr) {
524 		sk->sk_err = err;
525 		sk->sk_error_report(sk);
526 	} else	{ /* Only an error on timeout */
527 		sk->sk_err_soft = err;
528 	}
529 
530 out:
531 	bh_unlock_sock(sk);
532 	sock_put(sk);
533 }
534 
535 static void __tcp_v4_send_check(struct sk_buff *skb,
536 				__be32 saddr, __be32 daddr)
537 {
538 	struct tcphdr *th = tcp_hdr(skb);
539 
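	/* For CHECKSUM_PARTIAL only the pseudo-header sum is stored here;
	 * the device (or the software fallback) finishes the checksum on
	 * transmit.  Otherwise compute the full TCP checksum now.
	 */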
540 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
541 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
542 		skb->csum_start = skb_transport_header(skb) - skb->head;
543 		skb->csum_offset = offsetof(struct tcphdr, check);
544 	} else {
545 		th->check = tcp_v4_check(skb->len, saddr, daddr,
546 					 csum_partial(th,
547 						      th->doff << 2,
548 						      skb->csum));
549 	}
550 }
551 
552 /* This routine computes an IPv4 TCP checksum. */
553 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
554 {
555 	struct inet_sock *inet = inet_sk(sk);
556 
557 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
558 }
559 EXPORT_SYMBOL(tcp_v4_send_check);
560 
561 int tcp_v4_gso_send_check(struct sk_buff *skb)
562 {
563 	const struct iphdr *iph;
564 	struct tcphdr *th;
565 
566 	if (!pskb_may_pull(skb, sizeof(*th)))
567 		return -EINVAL;
568 
569 	iph = ip_hdr(skb);
570 	th = tcp_hdr(skb);
571 
572 	th->check = 0;
573 	skb->ip_summed = CHECKSUM_PARTIAL;
574 	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
575 	return 0;
576 }
577 
578 /*
579  *	This routine will send an RST to the other tcp.
580  *
581  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
582  *		      for reset.
583  *	Answer: if a packet caused the RST, it is not for a socket
584  *		existing in our system; if it is matched to a socket,
585  *		it is just a duplicate segment or a bug in the other side's
586  *		TCP. So we build the reply based only on the parameters
587  *		that arrived with the segment.
588  *	Exception: precedence violation. We do not implement it in any case.
589  */
590 
591 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
592 {
593 	struct tcphdr *th = tcp_hdr(skb);
594 	struct {
595 		struct tcphdr th;
596 #ifdef CONFIG_TCP_MD5SIG
597 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
598 #endif
599 	} rep;
600 	struct ip_reply_arg arg;
601 #ifdef CONFIG_TCP_MD5SIG
602 	struct tcp_md5sig_key *key;
603 #endif
604 	struct net *net;
605 
606 	/* Never send a reset in response to a reset. */
607 	if (th->rst)
608 		return;
609 
610 	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
611 		return;
612 
613 	/* Swap the send and the receive. */
614 	memset(&rep, 0, sizeof(rep));
615 	rep.th.dest   = th->source;
616 	rep.th.source = th->dest;
617 	rep.th.doff   = sizeof(struct tcphdr) / 4;
618 	rep.th.rst    = 1;
619 
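	/* Per RFC 793: if the offending segment carried an ACK, the RST uses
	 * that acknowledged sequence number; otherwise we ACK everything the
	 * segment covered and send the RST with sequence number zero.
	 */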
620 	if (th->ack) {
621 		rep.th.seq = th->ack_seq;
622 	} else {
623 		rep.th.ack = 1;
624 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
625 				       skb->len - (th->doff << 2));
626 	}
627 
628 	memset(&arg, 0, sizeof(arg));
629 	arg.iov[0].iov_base = (unsigned char *)&rep;
630 	arg.iov[0].iov_len  = sizeof(rep.th);
631 
632 #ifdef CONFIG_TCP_MD5SIG
633 	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
634 	if (key) {
635 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
636 				   (TCPOPT_NOP << 16) |
637 				   (TCPOPT_MD5SIG << 8) |
638 				   TCPOLEN_MD5SIG);
639 		/* Update length and the length the header thinks exists */
640 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
641 		rep.th.doff = arg.iov[0].iov_len / 4;
642 
643 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
644 				     key, ip_hdr(skb)->saddr,
645 				     ip_hdr(skb)->daddr, &rep.th);
646 	}
647 #endif
648 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
649 				      ip_hdr(skb)->saddr, /* XXX */
650 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
651 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
652 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
653 
654 	net = dev_net(skb_dst(skb)->dev);
655 	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
656 		      &arg, arg.iov[0].iov_len);
657 
658 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
659 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
660 }
661 
662 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
663    outside socket context, is certainly ugly. What can I do?
664  */
665 
666 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
667 			    u32 win, u32 ts, int oif,
668 			    struct tcp_md5sig_key *key,
669 			    int reply_flags)
670 {
671 	struct tcphdr *th = tcp_hdr(skb);
672 	struct {
673 		struct tcphdr th;
674 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
675 #ifdef CONFIG_TCP_MD5SIG
676 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
677 #endif
678 			];
679 	} rep;
680 	struct ip_reply_arg arg;
681 	struct net *net = dev_net(skb_dst(skb)->dev);
682 
683 	memset(&rep.th, 0, sizeof(struct tcphdr));
684 	memset(&arg, 0, sizeof(arg));
685 
686 	arg.iov[0].iov_base = (unsigned char *)&rep;
687 	arg.iov[0].iov_len  = sizeof(rep.th);
688 	if (ts) {
689 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
690 				   (TCPOPT_TIMESTAMP << 8) |
691 				   TCPOLEN_TIMESTAMP);
692 		rep.opt[1] = htonl(tcp_time_stamp);
693 		rep.opt[2] = htonl(ts);
694 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
695 	}
696 
697 	/* Swap the send and the receive. */
698 	rep.th.dest    = th->source;
699 	rep.th.source  = th->dest;
700 	rep.th.doff    = arg.iov[0].iov_len / 4;
701 	rep.th.seq     = htonl(seq);
702 	rep.th.ack_seq = htonl(ack);
703 	rep.th.ack     = 1;
704 	rep.th.window  = htons(win);
705 
706 #ifdef CONFIG_TCP_MD5SIG
707 	if (key) {
708 		int offset = (ts) ? 3 : 0;
709 
710 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
711 					  (TCPOPT_NOP << 16) |
712 					  (TCPOPT_MD5SIG << 8) |
713 					  TCPOLEN_MD5SIG);
714 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
715 		rep.th.doff = arg.iov[0].iov_len/4;
716 
717 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
718 				    key, ip_hdr(skb)->saddr,
719 				    ip_hdr(skb)->daddr, &rep.th);
720 	}
721 #endif
722 	arg.flags = reply_flags;
723 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
724 				      ip_hdr(skb)->saddr, /* XXX */
725 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
726 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
727 	if (oif)
728 		arg.bound_dev_if = oif;
729 
730 	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
731 		      &arg, arg.iov[0].iov_len);
732 
733 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
734 }
735 
736 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
737 {
738 	struct inet_timewait_sock *tw = inet_twsk(sk);
739 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
740 
741 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
742 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
743 			tcptw->tw_ts_recent,
744 			tw->tw_bound_dev_if,
745 			tcp_twsk_md5_key(tcptw),
746 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
747 			);
748 
749 	inet_twsk_put(tw);
750 }
751 
752 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
753 				  struct request_sock *req)
754 {
755 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
756 			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
757 			req->ts_recent,
758 			0,
759 			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
760 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
761 }
762 
763 /*
764  *	Send a SYN-ACK after having received a SYN.
765  *	This still operates on a request_sock only, not on a big
766  *	socket.
767  */
768 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
769 			      struct request_sock *req,
770 			      struct request_values *rvp)
771 {
772 	const struct inet_request_sock *ireq = inet_rsk(req);
773 	struct flowi4 fl4;
774 	int err = -1;
775 	struct sk_buff * skb;
776 
777 	/* First, grab a route. */
778 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
779 		return -1;
780 
781 	skb = tcp_make_synack(sk, dst, req, rvp);
782 
783 	if (skb) {
784 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
785 
786 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
787 					    ireq->rmt_addr,
788 					    ireq->opt);
789 		err = net_xmit_eval(err);
790 	}
791 
792 	dst_release(dst);
793 	return err;
794 }
795 
796 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
797 			      struct request_values *rvp)
798 {
799 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
800 	return tcp_v4_send_synack(sk, NULL, req, rvp);
801 }
802 
803 /*
804  *	IPv4 request_sock destructor.
805  */
806 static void tcp_v4_reqsk_destructor(struct request_sock *req)
807 {
808 	kfree(inet_rsk(req)->opt);
809 }
810 
811 /*
812  * Return 1 if a syncookie should be sent
813  */
814 int tcp_syn_flood_action(struct sock *sk,
815 			 const struct sk_buff *skb,
816 			 const char *proto)
817 {
818 	const char *msg = "Dropping request";
819 	int want_cookie = 0;
820 	struct listen_sock *lopt;
821 
822 
823 
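	/* Whether we answer an overflowing listen queue with cookies or
	 * simply drop is controlled by net.ipv4.tcp_syncookies, e.g.
	 * (administration sketch):  sysctl -w net.ipv4.tcp_syncookies=1
	 */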
824 #ifdef CONFIG_SYN_COOKIES
825 	if (sysctl_tcp_syncookies) {
826 		msg = "Sending cookies";
827 		want_cookie = 1;
828 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
829 	} else
830 #endif
831 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
832 
833 	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
834 	if (!lopt->synflood_warned) {
835 		lopt->synflood_warned = 1;
836 		pr_info("%s: Possible SYN flooding on port %d. %s. "
837 			" Check SNMP counters.\n",
838 			proto, ntohs(tcp_hdr(skb)->dest), msg);
839 	}
840 	return want_cookie;
841 }
842 EXPORT_SYMBOL(tcp_syn_flood_action);
843 
844 /*
845  * Save and compile IPv4 options into the request_sock if needed.
846  */
847 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
848 						  struct sk_buff *skb)
849 {
850 	const struct ip_options *opt = &(IPCB(skb)->opt);
851 	struct ip_options_rcu *dopt = NULL;
852 
853 	if (opt && opt->optlen) {
854 		int opt_size = sizeof(*dopt) + opt->optlen;
855 
856 		dopt = kmalloc(opt_size, GFP_ATOMIC);
857 		if (dopt) {
858 			if (ip_options_echo(&dopt->opt, skb)) {
859 				kfree(dopt);
860 				dopt = NULL;
861 			}
862 		}
863 	}
864 	return dopt;
865 }
866 
867 #ifdef CONFIG_TCP_MD5SIG
868 /*
869  * RFC2385 MD5 checksumming requires a mapping of
870  * IP address->MD5 Key.
871  * We need to maintain these in the sk structure.
872  */
873 
874 /* Find the Key structure for an address.  */
875 static struct tcp_md5sig_key *
876 			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
877 {
878 	struct tcp_sock *tp = tcp_sk(sk);
879 	int i;
880 
881 	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
882 		return NULL;
883 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
884 		if (tp->md5sig_info->keys4[i].addr == addr)
885 			return &tp->md5sig_info->keys4[i].base;
886 	}
887 	return NULL;
888 }
889 
890 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
891 					 struct sock *addr_sk)
892 {
893 	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
894 }
895 EXPORT_SYMBOL(tcp_v4_md5_lookup);
896 
897 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
898 						      struct request_sock *req)
899 {
900 	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
901 }
902 
903 /* This can be called on a newly created socket, from other files */
904 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
905 		      u8 *newkey, u8 newkeylen)
906 {
907 	/* Add Key to the list */
908 	struct tcp_md5sig_key *key;
909 	struct tcp_sock *tp = tcp_sk(sk);
910 	struct tcp4_md5sig_key *keys;
911 
912 	key = tcp_v4_md5_do_lookup(sk, addr);
913 	if (key) {
914 		/* Pre-existing entry - just update that one. */
915 		kfree(key->key);
916 		key->key = newkey;
917 		key->keylen = newkeylen;
918 	} else {
919 		struct tcp_md5sig_info *md5sig;
920 
921 		if (!tp->md5sig_info) {
922 			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
923 						  GFP_ATOMIC);
924 			if (!tp->md5sig_info) {
925 				kfree(newkey);
926 				return -ENOMEM;
927 			}
928 			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
929 		}
930 
931 		md5sig = tp->md5sig_info;
932 		if (md5sig->entries4 == 0 &&
933 		    tcp_alloc_md5sig_pool(sk) == NULL) {
934 			kfree(newkey);
935 			return -ENOMEM;
936 		}
937 
938 		if (md5sig->alloced4 == md5sig->entries4) {
939 			keys = kmalloc((sizeof(*keys) *
940 					(md5sig->entries4 + 1)), GFP_ATOMIC);
941 			if (!keys) {
942 				kfree(newkey);
943 				if (md5sig->entries4 == 0)
944 					tcp_free_md5sig_pool();
945 				return -ENOMEM;
946 			}
947 
948 			if (md5sig->entries4)
949 				memcpy(keys, md5sig->keys4,
950 				       sizeof(*keys) * md5sig->entries4);
951 
952 			/* Free old key list, and reference new one */
953 			kfree(md5sig->keys4);
954 			md5sig->keys4 = keys;
955 			md5sig->alloced4++;
956 		}
957 		md5sig->entries4++;
958 		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
959 		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
960 		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
961 	}
962 	return 0;
963 }
964 EXPORT_SYMBOL(tcp_v4_md5_do_add);
965 
966 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
967 			       u8 *newkey, u8 newkeylen)
968 {
969 	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
970 				 newkey, newkeylen);
971 }
972 
973 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
974 {
975 	struct tcp_sock *tp = tcp_sk(sk);
976 	int i;
977 
978 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
979 		if (tp->md5sig_info->keys4[i].addr == addr) {
980 			/* Free the key */
981 			kfree(tp->md5sig_info->keys4[i].base.key);
982 			tp->md5sig_info->entries4--;
983 
984 			if (tp->md5sig_info->entries4 == 0) {
985 				kfree(tp->md5sig_info->keys4);
986 				tp->md5sig_info->keys4 = NULL;
987 				tp->md5sig_info->alloced4 = 0;
988 				tcp_free_md5sig_pool();
989 			} else if (tp->md5sig_info->entries4 != i) {
990 				/* Need to do some manipulation */
991 				memmove(&tp->md5sig_info->keys4[i],
992 					&tp->md5sig_info->keys4[i+1],
993 					(tp->md5sig_info->entries4 - i) *
994 					 sizeof(struct tcp4_md5sig_key));
995 			}
996 			return 0;
997 		}
998 	}
999 	return -ENOENT;
1000 }
1001 EXPORT_SYMBOL(tcp_v4_md5_do_del);
1002 
1003 static void tcp_v4_clear_md5_list(struct sock *sk)
1004 {
1005 	struct tcp_sock *tp = tcp_sk(sk);
1006 
1007 	/* Free each key, then the set of keys,
1008 	 * the crypto element, and then decrement our
1009 	 * hold on the last resort crypto.
1010 	 */
1011 	if (tp->md5sig_info->entries4) {
1012 		int i;
1013 		for (i = 0; i < tp->md5sig_info->entries4; i++)
1014 			kfree(tp->md5sig_info->keys4[i].base.key);
1015 		tp->md5sig_info->entries4 = 0;
1016 		tcp_free_md5sig_pool();
1017 	}
1018 	if (tp->md5sig_info->keys4) {
1019 		kfree(tp->md5sig_info->keys4);
1020 		tp->md5sig_info->keys4 = NULL;
1021 		tp->md5sig_info->alloced4  = 0;
1022 	}
1023 }
1024 
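/* Userspace sketch (illustrative peer address and key, not kernel code):
 * an RFC 2385 key is installed per destination with the TCP_MD5SIG socket
 * option, which is handled by tcp_v4_parse_md5_keys() below.
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &a->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */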
1025 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1026 				 int optlen)
1027 {
1028 	struct tcp_md5sig cmd;
1029 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1030 	u8 *newkey;
1031 
1032 	if (optlen < sizeof(cmd))
1033 		return -EINVAL;
1034 
1035 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1036 		return -EFAULT;
1037 
1038 	if (sin->sin_family != AF_INET)
1039 		return -EINVAL;
1040 
1041 	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1042 		if (!tcp_sk(sk)->md5sig_info)
1043 			return -ENOENT;
1044 		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1045 	}
1046 
1047 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1048 		return -EINVAL;
1049 
1050 	if (!tcp_sk(sk)->md5sig_info) {
1051 		struct tcp_sock *tp = tcp_sk(sk);
1052 		struct tcp_md5sig_info *p;
1053 
1054 		p = kzalloc(sizeof(*p), sk->sk_allocation);
1055 		if (!p)
1056 			return -EINVAL;
1057 
1058 		tp->md5sig_info = p;
1059 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1060 	}
1061 
1062 	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1063 	if (!newkey)
1064 		return -ENOMEM;
1065 	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1066 				 newkey, cmd.tcpm_keylen);
1067 }
1068 
1069 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1070 					__be32 daddr, __be32 saddr, int nbytes)
1071 {
1072 	struct tcp4_pseudohdr *bp;
1073 	struct scatterlist sg;
1074 
1075 	bp = &hp->md5_blk.ip4;
1076 
1077 	/*
1078 	 * 1. the TCP pseudo-header (in the order: source IP address,
1079 	 * destination IP address, zero-padded protocol number, and
1080 	 * segment length)
1081 	 */
1082 	bp->saddr = saddr;
1083 	bp->daddr = daddr;
1084 	bp->pad = 0;
1085 	bp->protocol = IPPROTO_TCP;
1086 	bp->len = cpu_to_be16(nbytes);
1087 
1088 	sg_init_one(&sg, bp, sizeof(*bp));
1089 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1090 }
1091 
1092 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1093 			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1094 {
1095 	struct tcp_md5sig_pool *hp;
1096 	struct hash_desc *desc;
1097 
1098 	hp = tcp_get_md5sig_pool();
1099 	if (!hp)
1100 		goto clear_hash_noput;
1101 	desc = &hp->md5_desc;
1102 
1103 	if (crypto_hash_init(desc))
1104 		goto clear_hash;
1105 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1106 		goto clear_hash;
1107 	if (tcp_md5_hash_header(hp, th))
1108 		goto clear_hash;
1109 	if (tcp_md5_hash_key(hp, key))
1110 		goto clear_hash;
1111 	if (crypto_hash_final(desc, md5_hash))
1112 		goto clear_hash;
1113 
1114 	tcp_put_md5sig_pool();
1115 	return 0;
1116 
1117 clear_hash:
1118 	tcp_put_md5sig_pool();
1119 clear_hash_noput:
1120 	memset(md5_hash, 0, 16);
1121 	return 1;
1122 }
1123 
1124 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1125 			struct sock *sk, struct request_sock *req,
1126 			struct sk_buff *skb)
1127 {
1128 	struct tcp_md5sig_pool *hp;
1129 	struct hash_desc *desc;
1130 	struct tcphdr *th = tcp_hdr(skb);
1131 	__be32 saddr, daddr;
1132 
1133 	if (sk) {
1134 		saddr = inet_sk(sk)->inet_saddr;
1135 		daddr = inet_sk(sk)->inet_daddr;
1136 	} else if (req) {
1137 		saddr = inet_rsk(req)->loc_addr;
1138 		daddr = inet_rsk(req)->rmt_addr;
1139 	} else {
1140 		const struct iphdr *iph = ip_hdr(skb);
1141 		saddr = iph->saddr;
1142 		daddr = iph->daddr;
1143 	}
1144 
1145 	hp = tcp_get_md5sig_pool();
1146 	if (!hp)
1147 		goto clear_hash_noput;
1148 	desc = &hp->md5_desc;
1149 
1150 	if (crypto_hash_init(desc))
1151 		goto clear_hash;
1152 
1153 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1154 		goto clear_hash;
1155 	if (tcp_md5_hash_header(hp, th))
1156 		goto clear_hash;
1157 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1158 		goto clear_hash;
1159 	if (tcp_md5_hash_key(hp, key))
1160 		goto clear_hash;
1161 	if (crypto_hash_final(desc, md5_hash))
1162 		goto clear_hash;
1163 
1164 	tcp_put_md5sig_pool();
1165 	return 0;
1166 
1167 clear_hash:
1168 	tcp_put_md5sig_pool();
1169 clear_hash_noput:
1170 	memset(md5_hash, 0, 16);
1171 	return 1;
1172 }
1173 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1174 
1175 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1176 {
1177 	/*
1178 	 * This gets called for each TCP segment that arrives
1179 	 * so we want to be efficient.
1180 	 * We have 3 drop cases:
1181 	 * o No MD5 hash and one expected.
1182 	 * o MD5 hash and we're not expecting one.
1183 	 * o MD5 hash and it's wrong.
1184 	 */
1185 	__u8 *hash_location = NULL;
1186 	struct tcp_md5sig_key *hash_expected;
1187 	const struct iphdr *iph = ip_hdr(skb);
1188 	struct tcphdr *th = tcp_hdr(skb);
1189 	int genhash;
1190 	unsigned char newhash[16];
1191 
1192 	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1193 	hash_location = tcp_parse_md5sig_option(th);
1194 
1195 	/* We've parsed the options - do we have a hash? */
1196 	if (!hash_expected && !hash_location)
1197 		return 0;
1198 
1199 	if (hash_expected && !hash_location) {
1200 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1201 		return 1;
1202 	}
1203 
1204 	if (!hash_expected && hash_location) {
1205 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1206 		return 1;
1207 	}
1208 
1209 	/* Okay, we have both hash_expected and hash_location -
1210 	 * so we need to calculate the MD5 hash and compare.
1211 	 */
1212 	genhash = tcp_v4_md5_hash_skb(newhash,
1213 				      hash_expected,
1214 				      NULL, NULL, skb);
1215 
1216 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1217 		if (net_ratelimit()) {
1218 			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1219 			       &iph->saddr, ntohs(th->source),
1220 			       &iph->daddr, ntohs(th->dest),
1221 			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1222 		}
1223 		return 1;
1224 	}
1225 	return 0;
1226 }
1227 
1228 #endif
1229 
1230 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1231 	.family		=	PF_INET,
1232 	.obj_size	=	sizeof(struct tcp_request_sock),
1233 	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1234 	.send_ack	=	tcp_v4_reqsk_send_ack,
1235 	.destructor	=	tcp_v4_reqsk_destructor,
1236 	.send_reset	=	tcp_v4_send_reset,
1237 	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1238 };
1239 
1240 #ifdef CONFIG_TCP_MD5SIG
1241 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1242 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1243 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1244 };
1245 #endif
1246 
1247 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1248 {
1249 	struct tcp_extend_values tmp_ext;
1250 	struct tcp_options_received tmp_opt;
1251 	u8 *hash_location;
1252 	struct request_sock *req;
1253 	struct inet_request_sock *ireq;
1254 	struct tcp_sock *tp = tcp_sk(sk);
1255 	struct dst_entry *dst = NULL;
1256 	__be32 saddr = ip_hdr(skb)->saddr;
1257 	__be32 daddr = ip_hdr(skb)->daddr;
1258 	__u32 isn = TCP_SKB_CB(skb)->when;
1259 	int want_cookie = 0;
1260 
1261 	/* Never answer SYNs sent to broadcast or multicast */
1262 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1263 		goto drop;
1264 
1265 	/* TW buckets are converted to open requests without
1266 	 * limitation, since they conserve resources and the peer is
1267 	 * evidently a real one.
1268 	 */
1269 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1270 		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1271 		if (!want_cookie)
1272 			goto drop;
1273 	}
1274 
1275 	/* Accept backlog is full. If we have already queued enough
1276 	 * warm entries in the syn queue, drop the request. That is better
1277 	 * than clogging the syn queue with openreqs with exponentially
1278 	 * increasing timeouts.
1279 	 */
1280 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1281 		goto drop;
1282 
1283 	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1284 	if (!req)
1285 		goto drop;
1286 
1287 #ifdef CONFIG_TCP_MD5SIG
1288 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1289 #endif
1290 
1291 	tcp_clear_options(&tmp_opt);
1292 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1293 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1294 	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1295 
1296 	if (tmp_opt.cookie_plus > 0 &&
1297 	    tmp_opt.saw_tstamp &&
1298 	    !tp->rx_opt.cookie_out_never &&
1299 	    (sysctl_tcp_cookie_size > 0 ||
1300 	     (tp->cookie_values != NULL &&
1301 	      tp->cookie_values->cookie_desired > 0))) {
1302 		u8 *c;
1303 		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1304 		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1305 
1306 		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1307 			goto drop_and_release;
1308 
1309 		/* Secret recipe starts with IP addresses */
1310 		*mess++ ^= (__force u32)daddr;
1311 		*mess++ ^= (__force u32)saddr;
1312 
1313 		/* plus variable length Initiator Cookie */
1314 		c = (u8 *)mess;
1315 		while (l-- > 0)
1316 			*c++ ^= *hash_location++;
1317 
1318 		want_cookie = 0;	/* not our kind of cookie */
1319 		tmp_ext.cookie_out_never = 0; /* false */
1320 		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1321 	} else if (!tp->rx_opt.cookie_in_always) {
1322 		/* redundant indications, but ensure initialization. */
1323 		tmp_ext.cookie_out_never = 1; /* true */
1324 		tmp_ext.cookie_plus = 0;
1325 	} else {
1326 		goto drop_and_release;
1327 	}
1328 	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1329 
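	/* Without a timestamp a syncookie cannot encode the negotiated
	 * options (only the MSS fits in the cookie itself), so drop them.
	 */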
1330 	if (want_cookie && !tmp_opt.saw_tstamp)
1331 		tcp_clear_options(&tmp_opt);
1332 
1333 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1334 	tcp_openreq_init(req, &tmp_opt, skb);
1335 
1336 	ireq = inet_rsk(req);
1337 	ireq->loc_addr = daddr;
1338 	ireq->rmt_addr = saddr;
1339 	ireq->no_srccheck = inet_sk(sk)->transparent;
1340 	ireq->opt = tcp_v4_save_options(sk, skb);
1341 
1342 	if (security_inet_conn_request(sk, skb, req))
1343 		goto drop_and_free;
1344 
1345 	if (!want_cookie || tmp_opt.tstamp_ok)
1346 		TCP_ECN_create_request(req, tcp_hdr(skb));
1347 
1348 	if (want_cookie) {
1349 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1350 		req->cookie_ts = tmp_opt.tstamp_ok;
1351 	} else if (!isn) {
1352 		struct inet_peer *peer = NULL;
1353 		struct flowi4 fl4;
1354 
1355 		/* VJ's idea. We save the last timestamp seen
1356 		 * from the destination in the peer table when entering
1357 		 * state TIME-WAIT, and check against it before
1358 		 * accepting a new connection request.
1359 		 *
1360 		 * If "isn" is not zero, this request hit an alive
1361 		 * timewait bucket, so all the necessary checks
1362 		 * are made by the function processing the timewait state.
1363 		 */
1364 		if (tmp_opt.saw_tstamp &&
1365 		    tcp_death_row.sysctl_tw_recycle &&
1366 		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1367 		    fl4.daddr == saddr &&
1368 		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1369 			inet_peer_refcheck(peer);
1370 			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1371 			    (s32)(peer->tcp_ts - req->ts_recent) >
1372 							TCP_PAWS_WINDOW) {
1373 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1374 				goto drop_and_release;
1375 			}
1376 		}
1377 		/* Kill the following clause, if you dislike this way. */
1378 		else if (!sysctl_tcp_syncookies &&
1379 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1380 			  (sysctl_max_syn_backlog >> 2)) &&
1381 			 (!peer || !peer->tcp_ts_stamp) &&
1382 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1383 			/* Without syncookies, the last quarter of
1384 			 * the backlog is reserved for destinations
1385 			 * proven to be alive.
1386 			 * This means we keep communicating with
1387 			 * destinations that were already known
1388 			 * at the moment the synflood started.
1389 			 */
1390 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1391 				       &saddr, ntohs(tcp_hdr(skb)->source));
1392 			goto drop_and_release;
1393 		}
1394 
1395 		isn = tcp_v4_init_sequence(skb);
1396 	}
1397 	tcp_rsk(req)->snt_isn = isn;
1398 	tcp_rsk(req)->snt_synack = tcp_time_stamp;
1399 
1400 	if (tcp_v4_send_synack(sk, dst, req,
1401 			       (struct request_values *)&tmp_ext) ||
1402 	    want_cookie)
1403 		goto drop_and_free;
1404 
1405 	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1406 	return 0;
1407 
1408 drop_and_release:
1409 	dst_release(dst);
1410 drop_and_free:
1411 	reqsk_free(req);
1412 drop:
1413 	return 0;
1414 }
1415 EXPORT_SYMBOL(tcp_v4_conn_request);
1416 
1417 
1418 /*
1419  * The three-way handshake has completed - we got a valid ACK -
1420  * now create the new socket.
1421  */
1422 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1423 				  struct request_sock *req,
1424 				  struct dst_entry *dst)
1425 {
1426 	struct inet_request_sock *ireq;
1427 	struct inet_sock *newinet;
1428 	struct tcp_sock *newtp;
1429 	struct sock *newsk;
1430 #ifdef CONFIG_TCP_MD5SIG
1431 	struct tcp_md5sig_key *key;
1432 #endif
1433 	struct ip_options_rcu *inet_opt;
1434 
1435 	if (sk_acceptq_is_full(sk))
1436 		goto exit_overflow;
1437 
1438 	newsk = tcp_create_openreq_child(sk, req, skb);
1439 	if (!newsk)
1440 		goto exit_nonewsk;
1441 
1442 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1443 
1444 	newtp		      = tcp_sk(newsk);
1445 	newinet		      = inet_sk(newsk);
1446 	ireq		      = inet_rsk(req);
1447 	newinet->inet_daddr   = ireq->rmt_addr;
1448 	newinet->inet_rcv_saddr = ireq->loc_addr;
1449 	newinet->inet_saddr	      = ireq->loc_addr;
1450 	inet_opt	      = ireq->opt;
1451 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1452 	ireq->opt	      = NULL;
1453 	newinet->mc_index     = inet_iif(skb);
1454 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1455 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1456 	if (inet_opt)
1457 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1458 	newinet->inet_id = newtp->write_seq ^ jiffies;
1459 
1460 	if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1461 		goto put_and_exit;
1462 
1463 	sk_setup_caps(newsk, dst);
1464 
1465 	tcp_mtup_init(newsk);
1466 	tcp_sync_mss(newsk, dst_mtu(dst));
1467 	newtp->advmss = dst_metric_advmss(dst);
1468 	if (tcp_sk(sk)->rx_opt.user_mss &&
1469 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1470 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1471 
1472 	tcp_initialize_rcv_mss(newsk);
1473 	if (tcp_rsk(req)->snt_synack)
1474 		tcp_valid_rtt_meas(newsk,
1475 		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
1476 	newtp->total_retrans = req->retrans;
1477 
1478 #ifdef CONFIG_TCP_MD5SIG
1479 	/* Copy over the MD5 key from the original socket */
1480 	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1481 	if (key != NULL) {
1482 		/*
1483 		 * We're using one, so create a matching key
1484 		 * on the newsk structure. If we fail to get
1485 		 * memory, then we end up not copying the key
1486 		 * across. Shucks.
1487 		 */
1488 		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1489 		if (newkey != NULL)
1490 			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1491 					  newkey, key->keylen);
1492 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1493 	}
1494 #endif
1495 
1496 	if (__inet_inherit_port(sk, newsk) < 0)
1497 		goto put_and_exit;
1498 	__inet_hash_nolisten(newsk, NULL);
1499 
1500 	return newsk;
1501 
1502 exit_overflow:
1503 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1504 exit_nonewsk:
1505 	dst_release(dst);
1506 exit:
1507 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1508 	return NULL;
1509 put_and_exit:
1510 	sock_put(newsk);
1511 	goto exit;
1512 }
1513 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1514 
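/* For a segment arriving on a listening socket, figure out what it belongs
 * to: a pending request_sock (then validate it via tcp_check_req()), an
 * already established child socket, or - as a last resort with syncookies
 * enabled - an ACK carrying a valid cookie.
 */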
1515 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1516 {
1517 	struct tcphdr *th = tcp_hdr(skb);
1518 	const struct iphdr *iph = ip_hdr(skb);
1519 	struct sock *nsk;
1520 	struct request_sock **prev;
1521 	/* Find possible connection requests. */
1522 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1523 						       iph->saddr, iph->daddr);
1524 	if (req)
1525 		return tcp_check_req(sk, skb, req, prev);
1526 
1527 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1528 			th->source, iph->daddr, th->dest, inet_iif(skb));
1529 
1530 	if (nsk) {
1531 		if (nsk->sk_state != TCP_TIME_WAIT) {
1532 			bh_lock_sock(nsk);
1533 			return nsk;
1534 		}
1535 		inet_twsk_put(inet_twsk(nsk));
1536 		return NULL;
1537 	}
1538 
1539 #ifdef CONFIG_SYN_COOKIES
1540 	if (!th->syn)
1541 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1542 #endif
1543 	return sk;
1544 }
1545 
1546 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1547 {
1548 	const struct iphdr *iph = ip_hdr(skb);
1549 
1550 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1551 		if (!tcp_v4_check(skb->len, iph->saddr,
1552 				  iph->daddr, skb->csum)) {
1553 			skb->ip_summed = CHECKSUM_UNNECESSARY;
1554 			return 0;
1555 		}
1556 	}
1557 
1558 	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1559 				       skb->len, IPPROTO_TCP, 0);
1560 
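	/* Short packets are cheap to verify immediately; for longer ones keep
	 * the pseudo-header sum and defer full verification to
	 * tcp_checksum_complete().
	 */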
1561 	if (skb->len <= 76) {
1562 		return __skb_checksum_complete(skb);
1563 	}
1564 	return 0;
1565 }
1566 
1567 
1568 /* The socket must have its spinlock held when we get
1569  * here.
1570  *
1571  * We have a potential double-lock case here, so even when
1572  * doing backlog processing we use the BH locking scheme.
1573  * This is because we cannot sleep with the original spinlock
1574  * held.
1575  */
1576 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1577 {
1578 	struct sock *rsk;
1579 #ifdef CONFIG_TCP_MD5SIG
1580 	/*
1581 	 * We really want to reject the packet as early as possible
1582 	 * if:
1583 	 *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1584 	 *  o There is an MD5 option and we're not expecting one
1585 	 */
1586 	if (tcp_v4_inbound_md5_hash(sk, skb))
1587 		goto discard;
1588 #endif
1589 
1590 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1591 		sock_rps_save_rxhash(sk, skb->rxhash);
1592 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1593 			rsk = sk;
1594 			goto reset;
1595 		}
1596 		return 0;
1597 	}
1598 
1599 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1600 		goto csum_err;
1601 
1602 	if (sk->sk_state == TCP_LISTEN) {
1603 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1604 		if (!nsk)
1605 			goto discard;
1606 
1607 		if (nsk != sk) {
1608 			sock_rps_save_rxhash(nsk, skb->rxhash);
1609 			if (tcp_child_process(sk, nsk, skb)) {
1610 				rsk = nsk;
1611 				goto reset;
1612 			}
1613 			return 0;
1614 		}
1615 	} else
1616 		sock_rps_save_rxhash(sk, skb->rxhash);
1617 
1618 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1619 		rsk = sk;
1620 		goto reset;
1621 	}
1622 	return 0;
1623 
1624 reset:
1625 	tcp_v4_send_reset(rsk, skb);
1626 discard:
1627 	kfree_skb(skb);
1628 	/* Be careful here. If this function gets more complicated and
1629 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1630 	 * might be destroyed here. This current version compiles correctly,
1631 	 * but you have been warned.
1632 	 */
1633 	return 0;
1634 
1635 csum_err:
1636 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1637 	goto discard;
1638 }
1639 EXPORT_SYMBOL(tcp_v4_do_rcv);
1640 
1641 /*
1642  *	From tcp_input.c
1643  */
1644 
1645 int tcp_v4_rcv(struct sk_buff *skb)
1646 {
1647 	const struct iphdr *iph;
1648 	struct tcphdr *th;
1649 	struct sock *sk;
1650 	int ret;
1651 	struct net *net = dev_net(skb->dev);
1652 
1653 	if (skb->pkt_type != PACKET_HOST)
1654 		goto discard_it;
1655 
1656 	/* Count it even if it's bad */
1657 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1658 
1659 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1660 		goto discard_it;
1661 
1662 	th = tcp_hdr(skb);
1663 
1664 	if (th->doff < sizeof(struct tcphdr) / 4)
1665 		goto bad_packet;
1666 	if (!pskb_may_pull(skb, th->doff * 4))
1667 		goto discard_it;
1668 
1669 	/* An explanation is required here, I think.
1670 	 * Packet length and doff are validated by header prediction,
1671 	 * provided the case of th->doff == 0 is eliminated.
1672 	 * So, we defer the checks. */
1673 	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1674 		goto bad_packet;
1675 
1676 	th = tcp_hdr(skb);
1677 	iph = ip_hdr(skb);
1678 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1679 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1680 				    skb->len - th->doff * 4);
1681 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1682 	TCP_SKB_CB(skb)->when	 = 0;
1683 	TCP_SKB_CB(skb)->flags	 = iph->tos;
1684 	TCP_SKB_CB(skb)->sacked	 = 0;
1685 
1686 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1687 	if (!sk)
1688 		goto no_tcp_socket;
1689 
1690 process:
1691 	if (sk->sk_state == TCP_TIME_WAIT)
1692 		goto do_time_wait;
1693 
1694 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1695 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1696 		goto discard_and_relse;
1697 	}
1698 
1699 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1700 		goto discard_and_relse;
1701 	nf_reset(skb);
1702 
1703 	if (sk_filter(sk, skb))
1704 		goto discard_and_relse;
1705 
1706 	skb->dev = NULL;
1707 
1708 	bh_lock_sock_nested(sk);
1709 	ret = 0;
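	/* Three delivery paths: process the segment directly, hand it to the
	 * prequeue for the reading task, or - if the owner currently holds
	 * the socket - park it on the backlog.
	 */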
1710 	if (!sock_owned_by_user(sk)) {
1711 #ifdef CONFIG_NET_DMA
1712 		struct tcp_sock *tp = tcp_sk(sk);
1713 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1714 			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1715 		if (tp->ucopy.dma_chan)
1716 			ret = tcp_v4_do_rcv(sk, skb);
1717 		else
1718 #endif
1719 		{
1720 			if (!tcp_prequeue(sk, skb))
1721 				ret = tcp_v4_do_rcv(sk, skb);
1722 		}
1723 	} else if (unlikely(sk_add_backlog(sk, skb))) {
1724 		bh_unlock_sock(sk);
1725 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1726 		goto discard_and_relse;
1727 	}
1728 	bh_unlock_sock(sk);
1729 
1730 	sock_put(sk);
1731 
1732 	return ret;
1733 
1734 no_tcp_socket:
1735 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1736 		goto discard_it;
1737 
1738 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1739 bad_packet:
1740 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1741 	} else {
1742 		tcp_v4_send_reset(NULL, skb);
1743 	}
1744 
1745 discard_it:
1746 	/* Discard frame. */
1747 	kfree_skb(skb);
1748 	return 0;
1749 
1750 discard_and_relse:
1751 	sock_put(sk);
1752 	goto discard_it;
1753 
1754 do_time_wait:
1755 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1756 		inet_twsk_put(inet_twsk(sk));
1757 		goto discard_it;
1758 	}
1759 
1760 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1761 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1762 		inet_twsk_put(inet_twsk(sk));
1763 		goto discard_it;
1764 	}
1765 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1766 	case TCP_TW_SYN: {
1767 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1768 							&tcp_hashinfo,
1769 							iph->daddr, th->dest,
1770 							inet_iif(skb));
1771 		if (sk2) {
1772 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1773 			inet_twsk_put(inet_twsk(sk));
1774 			sk = sk2;
1775 			goto process;
1776 		}
1777 		/* Fall through to ACK */
1778 	}
1779 	case TCP_TW_ACK:
1780 		tcp_v4_timewait_ack(sk, skb);
1781 		break;
1782 	case TCP_TW_RST:
1783 		goto no_tcp_socket;
1784 	case TCP_TW_SUCCESS:;
1785 	}
1786 	goto discard_it;
1787 }
1788 
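/* Return the inet_peer entry for the connection's destination: prefer the
 * one cached on the route, otherwise look it up; *release_it tells the
 * caller whether it must drop the reference itself.
 */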
1789 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1790 {
1791 	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1792 	struct inet_sock *inet = inet_sk(sk);
1793 	struct inet_peer *peer;
1794 
1795 	if (!rt ||
1796 	    inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1797 		peer = inet_getpeer_v4(inet->inet_daddr, 1);
1798 		*release_it = true;
1799 	} else {
1800 		if (!rt->peer)
1801 			rt_bind_peer(rt, inet->inet_daddr, 1);
1802 		peer = rt->peer;
1803 		*release_it = false;
1804 	}
1805 
1806 	return peer;
1807 }
1808 EXPORT_SYMBOL(tcp_v4_get_peer);
1809 
1810 void *tcp_v4_tw_get_peer(struct sock *sk)
1811 {
1812 	struct inet_timewait_sock *tw = inet_twsk(sk);
1813 
1814 	return inet_getpeer_v4(tw->tw_daddr, 1);
1815 }
1816 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1817 
1818 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1819 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1820 	.twsk_unique	= tcp_twsk_unique,
1821 	.twsk_destructor= tcp_twsk_destructor,
1822 	.twsk_getpeer	= tcp_v4_tw_get_peer,
1823 };
1824 
1825 const struct inet_connection_sock_af_ops ipv4_specific = {
1826 	.queue_xmit	   = ip_queue_xmit,
1827 	.send_check	   = tcp_v4_send_check,
1828 	.rebuild_header	   = inet_sk_rebuild_header,
1829 	.conn_request	   = tcp_v4_conn_request,
1830 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1831 	.get_peer	   = tcp_v4_get_peer,
1832 	.net_header_len	   = sizeof(struct iphdr),
1833 	.setsockopt	   = ip_setsockopt,
1834 	.getsockopt	   = ip_getsockopt,
1835 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1836 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1837 	.bind_conflict	   = inet_csk_bind_conflict,
1838 #ifdef CONFIG_COMPAT
1839 	.compat_setsockopt = compat_ip_setsockopt,
1840 	.compat_getsockopt = compat_ip_getsockopt,
1841 #endif
1842 };
1843 EXPORT_SYMBOL(ipv4_specific);
1844 
1845 #ifdef CONFIG_TCP_MD5SIG
1846 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1847 	.md5_lookup		= tcp_v4_md5_lookup,
1848 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1849 	.md5_add		= tcp_v4_md5_add_func,
1850 	.md5_parse		= tcp_v4_parse_md5_keys,
1851 };
1852 #endif
1853 
1854 /* NOTE: A lot of things are set to zero explicitly by the call to
1855  *       sk_alloc(), so they need not be done here.
1856  */
1857 static int tcp_v4_init_sock(struct sock *sk)
1858 {
1859 	struct inet_connection_sock *icsk = inet_csk(sk);
1860 	struct tcp_sock *tp = tcp_sk(sk);
1861 
1862 	skb_queue_head_init(&tp->out_of_order_queue);
1863 	tcp_init_xmit_timers(sk);
1864 	tcp_prequeue_init(tp);
1865 
1866 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1867 	tp->mdev = TCP_TIMEOUT_INIT;
1868 
1869 	/* So many TCP implementations out there (incorrectly) count the
1870 	 * initial SYN frame in their delayed-ACK and congestion control
1871 	 * algorithms that we must have the following bandaid to talk
1872 	 * efficiently to them.  -DaveM
1873 	 */
1874 	tp->snd_cwnd = TCP_INIT_CWND;
1875 
1876 	/* See draft-stevens-tcpca-spec-01 for discussion of the
1877 	 * initialization of these values.
1878 	 */
1879 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1880 	tp->snd_cwnd_clamp = ~0;
1881 	tp->mss_cache = TCP_MSS_DEFAULT;
1882 
1883 	tp->reordering = sysctl_tcp_reordering;
1884 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1885 
1886 	sk->sk_state = TCP_CLOSE;
1887 
1888 	sk->sk_write_space = sk_stream_write_space;
1889 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1890 
1891 	icsk->icsk_af_ops = &ipv4_specific;
1892 	icsk->icsk_sync_mss = tcp_sync_mss;
1893 #ifdef CONFIG_TCP_MD5SIG
1894 	tp->af_specific = &tcp_sock_ipv4_specific;
1895 #endif
1896 
1897 	/* TCP Cookie Transactions */
1898 	if (sysctl_tcp_cookie_size > 0) {
1899 		/* Default, cookies without s_data_payload. */
1900 		tp->cookie_values =
1901 			kzalloc(sizeof(*tp->cookie_values),
1902 				sk->sk_allocation);
1903 		if (tp->cookie_values != NULL)
1904 			kref_init(&tp->cookie_values->kref);
1905 	}
1906 	/* Presumed zeroed, in order of appearance:
1907 	 *	cookie_in_always, cookie_out_never,
1908 	 *	s_data_constant, s_data_in, s_data_out
1909 	 */
1910 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1911 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1912 
1913 	local_bh_disable();
1914 	percpu_counter_inc(&tcp_sockets_allocated);
1915 	local_bh_enable();
1916 
1917 	return 0;
1918 }
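
/*
 * Editorial note: tcp_v4_init_sock() is installed as tcp_prot.init further
 * down, so it runs via sk->sk_prot->init() whenever user space opens an
 * IPv4 TCP socket.  Illustrative user-space trigger (standard socket API
 * only, values assumed):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
 */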
1919 
1920 void tcp_v4_destroy_sock(struct sock *sk)
1921 {
1922 	struct tcp_sock *tp = tcp_sk(sk);
1923 
1924 	tcp_clear_xmit_timers(sk);
1925 
1926 	tcp_cleanup_congestion_control(sk);
1927 
1928 	/* Clean up the write buffer. */
1929 	tcp_write_queue_purge(sk);
1930 
1931 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1932 	__skb_queue_purge(&tp->out_of_order_queue);
1933 
1934 #ifdef CONFIG_TCP_MD5SIG
1935 	/* Clean up the MD5 key list, if any */
1936 	if (tp->md5sig_info) {
1937 		tcp_v4_clear_md5_list(sk);
1938 		kfree(tp->md5sig_info);
1939 		tp->md5sig_info = NULL;
1940 	}
1941 #endif
1942 
1943 #ifdef CONFIG_NET_DMA
1944 	/* Cleans up our sk_async_wait_queue */
1945 	__skb_queue_purge(&sk->sk_async_wait_queue);
1946 #endif
1947 
1948 	/* Clean up the prequeue; it should already be empty. */
1949 	__skb_queue_purge(&tp->ucopy.prequeue);
1950 
1951 	/* Clean up a referenced TCP bind bucket. */
1952 	if (inet_csk(sk)->icsk_bind_hash)
1953 		inet_put_port(sk);
1954 
1955 	/*
1956 	 * If sendmsg cached page exists, toss it.
1957 	 */
1958 	if (sk->sk_sndmsg_page) {
1959 		__free_page(sk->sk_sndmsg_page);
1960 		sk->sk_sndmsg_page = NULL;
1961 	}
1962 
1963 	/* TCP Cookie Transactions */
1964 	if (tp->cookie_values != NULL) {
1965 		kref_put(&tp->cookie_values->kref,
1966 			 tcp_cookie_values_release);
1967 		tp->cookie_values = NULL;
1968 	}
1969 
1970 	percpu_counter_dec(&tcp_sockets_allocated);
1971 }
1972 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1973 
1974 #ifdef CONFIG_PROC_FS
1975 /* Proc filesystem TCP sock list dumping. */
1976 
1977 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1978 {
1979 	return hlist_nulls_empty(head) ? NULL :
1980 		list_entry(head->first, struct inet_timewait_sock, tw_node);
1981 }
1982 
1983 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1984 {
1985 	return !is_a_nulls(tw->tw_node.next) ?
1986 		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1987 }
1988 
1989 /*
1990  * Get the next listener socket following cur.  If cur is NULL, get the
1991  * first socket starting from the bucket given in st->bucket; when
1992  * st->bucket is zero, the very first socket in the hash table is returned.
1993  */
1994 static void *listening_get_next(struct seq_file *seq, void *cur)
1995 {
1996 	struct inet_connection_sock *icsk;
1997 	struct hlist_nulls_node *node;
1998 	struct sock *sk = cur;
1999 	struct inet_listen_hashbucket *ilb;
2000 	struct tcp_iter_state *st = seq->private;
2001 	struct net *net = seq_file_net(seq);
2002 
2003 	if (!sk) {
2004 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2005 		spin_lock_bh(&ilb->lock);
2006 		sk = sk_nulls_head(&ilb->head);
2007 		st->offset = 0;
2008 		goto get_sk;
2009 	}
2010 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2011 	++st->num;
2012 	++st->offset;
2013 
2014 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
2015 		struct request_sock *req = cur;
2016 
2017 		icsk = inet_csk(st->syn_wait_sk);
2018 		req = req->dl_next;
2019 		while (1) {
2020 			while (req) {
2021 				if (req->rsk_ops->family == st->family) {
2022 					cur = req;
2023 					goto out;
2024 				}
2025 				req = req->dl_next;
2026 			}
2027 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2028 				break;
2029 get_req:
2030 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2031 		}
2032 		sk	  = sk_nulls_next(st->syn_wait_sk);
2033 		st->state = TCP_SEQ_STATE_LISTENING;
2034 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2035 	} else {
2036 		icsk = inet_csk(sk);
2037 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2038 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2039 			goto start_req;
2040 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2041 		sk = sk_nulls_next(sk);
2042 	}
2043 get_sk:
2044 	sk_nulls_for_each_from(sk, node) {
2045 		if (!net_eq(sock_net(sk), net))
2046 			continue;
2047 		if (sk->sk_family == st->family) {
2048 			cur = sk;
2049 			goto out;
2050 		}
2051 		icsk = inet_csk(sk);
2052 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2053 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2054 start_req:
2055 			st->uid		= sock_i_uid(sk);
2056 			st->syn_wait_sk = sk;
2057 			st->state	= TCP_SEQ_STATE_OPENREQ;
2058 			st->sbucket	= 0;
2059 			goto get_req;
2060 		}
2061 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2062 	}
2063 	spin_unlock_bh(&ilb->lock);
2064 	st->offset = 0;
2065 	if (++st->bucket < INET_LHTABLE_SIZE) {
2066 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2067 		spin_lock_bh(&ilb->lock);
2068 		sk = sk_nulls_head(&ilb->head);
2069 		goto get_sk;
2070 	}
2071 	cur = NULL;
2072 out:
2073 	return cur;
2074 }
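
/*
 * Editorial note on the walk above: for each listening socket the iterator
 * also descends into that socket's SYN table of embryonic open requests,
 * switching st->state to TCP_SEQ_STATE_OPENREQ and holding syn_wait_lock
 * while it does so; st->bucket, st->sbucket and st->offset record where to
 * resume on the next call.
 */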
2075 
2076 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2077 {
2078 	struct tcp_iter_state *st = seq->private;
2079 	void *rc;
2080 
2081 	st->bucket = 0;
2082 	st->offset = 0;
2083 	rc = listening_get_next(seq, NULL);
2084 
2085 	while (rc && *pos) {
2086 		rc = listening_get_next(seq, rc);
2087 		--*pos;
2088 	}
2089 	return rc;
2090 }
2091 
2092 static inline int empty_bucket(struct tcp_iter_state *st)
2093 {
2094 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2095 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2096 }
2097 
2098 /*
2099  * Get the first established socket, starting from the bucket given in st->bucket.
2100  * If st->bucket is zero, the very first socket in the hash is returned.
2101  */
2102 static void *established_get_first(struct seq_file *seq)
2103 {
2104 	struct tcp_iter_state *st = seq->private;
2105 	struct net *net = seq_file_net(seq);
2106 	void *rc = NULL;
2107 
2108 	st->offset = 0;
2109 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2110 		struct sock *sk;
2111 		struct hlist_nulls_node *node;
2112 		struct inet_timewait_sock *tw;
2113 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2114 
2115 		/* Lockless fast path for the common case of empty buckets */
2116 		if (empty_bucket(st))
2117 			continue;
2118 
2119 		spin_lock_bh(lock);
2120 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2121 			if (sk->sk_family != st->family ||
2122 			    !net_eq(sock_net(sk), net)) {
2123 				continue;
2124 			}
2125 			rc = sk;
2126 			goto out;
2127 		}
2128 		st->state = TCP_SEQ_STATE_TIME_WAIT;
2129 		inet_twsk_for_each(tw, node,
2130 				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2131 			if (tw->tw_family != st->family ||
2132 			    !net_eq(twsk_net(tw), net)) {
2133 				continue;
2134 			}
2135 			rc = tw;
2136 			goto out;
2137 		}
2138 		spin_unlock_bh(lock);
2139 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2140 	}
2141 out:
2142 	return rc;
2143 }
2144 
2145 static void *established_get_next(struct seq_file *seq, void *cur)
2146 {
2147 	struct sock *sk = cur;
2148 	struct inet_timewait_sock *tw;
2149 	struct hlist_nulls_node *node;
2150 	struct tcp_iter_state *st = seq->private;
2151 	struct net *net = seq_file_net(seq);
2152 
2153 	++st->num;
2154 	++st->offset;
2155 
2156 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2157 		tw = cur;
2158 		tw = tw_next(tw);
2159 get_tw:
2160 		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2161 			tw = tw_next(tw);
2162 		}
2163 		if (tw) {
2164 			cur = tw;
2165 			goto out;
2166 		}
2167 		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2168 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2169 
2170 		/* Look for the next non-empty bucket */
2171 		st->offset = 0;
2172 		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2173 				empty_bucket(st))
2174 			;
2175 		if (st->bucket > tcp_hashinfo.ehash_mask)
2176 			return NULL;
2177 
2178 		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2179 		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2180 	} else
2181 		sk = sk_nulls_next(sk);
2182 
2183 	sk_nulls_for_each_from(sk, node) {
2184 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2185 			goto found;
2186 	}
2187 
2188 	st->state = TCP_SEQ_STATE_TIME_WAIT;
2189 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2190 	goto get_tw;
2191 found:
2192 	cur = sk;
2193 out:
2194 	return cur;
2195 }
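
/*
 * Editorial note: each ehash bucket is scanned in two passes - the regular
 * established chain first, then the TIME_WAIT chain (twchain) - with
 * st->state flipping between TCP_SEQ_STATE_ESTABLISHED and
 * TCP_SEQ_STATE_TIME_WAIT.  The bucket lock is held across a pass and
 * dropped before advancing to the next non-empty bucket.
 */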
2196 
2197 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2198 {
2199 	struct tcp_iter_state *st = seq->private;
2200 	void *rc;
2201 
2202 	st->bucket = 0;
2203 	rc = established_get_first(seq);
2204 
2205 	while (rc && pos) {
2206 		rc = established_get_next(seq, rc);
2207 		--pos;
2208 	}
2209 	return rc;
2210 }
2211 
2212 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2213 {
2214 	void *rc;
2215 	struct tcp_iter_state *st = seq->private;
2216 
2217 	st->state = TCP_SEQ_STATE_LISTENING;
2218 	rc	  = listening_get_idx(seq, &pos);
2219 
2220 	if (!rc) {
2221 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2222 		rc	  = established_get_idx(seq, pos);
2223 	}
2224 
2225 	return rc;
2226 }
2227 
2228 static void *tcp_seek_last_pos(struct seq_file *seq)
2229 {
2230 	struct tcp_iter_state *st = seq->private;
2231 	int offset = st->offset;
2232 	int orig_num = st->num;
2233 	void *rc = NULL;
2234 
2235 	switch (st->state) {
2236 	case TCP_SEQ_STATE_OPENREQ:
2237 	case TCP_SEQ_STATE_LISTENING:
2238 		if (st->bucket >= INET_LHTABLE_SIZE)
2239 			break;
2240 		st->state = TCP_SEQ_STATE_LISTENING;
2241 		rc = listening_get_next(seq, NULL);
2242 		while (offset-- && rc)
2243 			rc = listening_get_next(seq, rc);
2244 		if (rc)
2245 			break;
2246 		st->bucket = 0;
2247 		/* Fallthrough */
2248 	case TCP_SEQ_STATE_ESTABLISHED:
2249 	case TCP_SEQ_STATE_TIME_WAIT:
2250 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2251 		if (st->bucket > tcp_hashinfo.ehash_mask)
2252 			break;
2253 		rc = established_get_first(seq);
2254 		while (offset-- && rc)
2255 			rc = established_get_next(seq, rc);
2256 	}
2257 
2258 	st->num = orig_num;
2259 
2260 	return rc;
2261 }
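
/*
 * Editorial note: tcp_seek_last_pos() lets a sequential reader of
 * /proc/net/tcp resume from the bucket/offset recorded by the previous
 * read instead of re-walking the whole hash from the start.  st->num is
 * restored afterwards because the helpers above increment it while
 * re-seeking.
 */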
2262 
2263 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2264 {
2265 	struct tcp_iter_state *st = seq->private;
2266 	void *rc;
2267 
2268 	if (*pos && *pos == st->last_pos) {
2269 		rc = tcp_seek_last_pos(seq);
2270 		if (rc)
2271 			goto out;
2272 	}
2273 
2274 	st->state = TCP_SEQ_STATE_LISTENING;
2275 	st->num = 0;
2276 	st->bucket = 0;
2277 	st->offset = 0;
2278 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2279 
2280 out:
2281 	st->last_pos = *pos;
2282 	return rc;
2283 }
2284 
2285 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2286 {
2287 	struct tcp_iter_state *st = seq->private;
2288 	void *rc = NULL;
2289 
2290 	if (v == SEQ_START_TOKEN) {
2291 		rc = tcp_get_idx(seq, 0);
2292 		goto out;
2293 	}
2294 
2295 	switch (st->state) {
2296 	case TCP_SEQ_STATE_OPENREQ:
2297 	case TCP_SEQ_STATE_LISTENING:
2298 		rc = listening_get_next(seq, v);
2299 		if (!rc) {
2300 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2301 			st->bucket = 0;
2302 			st->offset = 0;
2303 			rc	  = established_get_first(seq);
2304 		}
2305 		break;
2306 	case TCP_SEQ_STATE_ESTABLISHED:
2307 	case TCP_SEQ_STATE_TIME_WAIT:
2308 		rc = established_get_next(seq, v);
2309 		break;
2310 	}
2311 out:
2312 	++*pos;
2313 	st->last_pos = *pos;
2314 	return rc;
2315 }
2316 
2317 static void tcp_seq_stop(struct seq_file *seq, void *v)
2318 {
2319 	struct tcp_iter_state *st = seq->private;
2320 
2321 	switch (st->state) {
2322 	case TCP_SEQ_STATE_OPENREQ:
2323 		if (v) {
2324 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2325 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2326 		}
2327 	case TCP_SEQ_STATE_LISTENING:
2328 		if (v != SEQ_START_TOKEN)
2329 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2330 		break;
2331 	case TCP_SEQ_STATE_TIME_WAIT:
2332 	case TCP_SEQ_STATE_ESTABLISHED:
2333 		if (v)
2334 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2335 		break;
2336 	}
2337 }
2338 
2339 static int tcp_seq_open(struct inode *inode, struct file *file)
2340 {
2341 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2342 	struct tcp_iter_state *s;
2343 	int err;
2344 
2345 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2346 			  sizeof(struct tcp_iter_state));
2347 	if (err < 0)
2348 		return err;
2349 
2350 	s = ((struct seq_file *)file->private_data)->private;
2351 	s->family		= afinfo->family;
2352 	s->last_pos 		= 0;
2353 	return 0;
2354 }
2355 
2356 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2357 {
2358 	int rc = 0;
2359 	struct proc_dir_entry *p;
2360 
2361 	afinfo->seq_fops.open		= tcp_seq_open;
2362 	afinfo->seq_fops.read		= seq_read;
2363 	afinfo->seq_fops.llseek		= seq_lseek;
2364 	afinfo->seq_fops.release	= seq_release_net;
2365 
2366 	afinfo->seq_ops.start		= tcp_seq_start;
2367 	afinfo->seq_ops.next		= tcp_seq_next;
2368 	afinfo->seq_ops.stop		= tcp_seq_stop;
2369 
2370 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2371 			     &afinfo->seq_fops, afinfo);
2372 	if (!p)
2373 		rc = -ENOMEM;
2374 	return rc;
2375 }
2376 EXPORT_SYMBOL(tcp_proc_register);
2377 
2378 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2379 {
2380 	proc_net_remove(net, afinfo->name);
2381 }
2382 EXPORT_SYMBOL(tcp_proc_unregister);
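
/*
 * Editorial sketch (hypothetical names): another user of these helpers
 * fills in its own struct tcp_seq_afinfo and registers it from pernet
 * init/exit hooks, much as tcp4_seq_afinfo does below and tcp_ipv6.c does
 * with tcp6_seq_afinfo:
 *
 *	static struct tcp_seq_afinfo example_seq_afinfo = {
 *		.name		= "tcp_example",
 *		.family		= AF_INET,
 *		.seq_fops	= { .owner = THIS_MODULE, },
 *		.seq_ops	= { .show = tcp4_seq_show, },
 *	};
 *
 *	err = tcp_proc_register(net, &example_seq_afinfo);
 */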
2383 
2384 static void get_openreq4(struct sock *sk, struct request_sock *req,
2385 			 struct seq_file *f, int i, int uid, int *len)
2386 {
2387 	const struct inet_request_sock *ireq = inet_rsk(req);
2388 	int ttd = req->expires - jiffies;
2389 
2390 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2391 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2392 		i,
2393 		ireq->loc_addr,
2394 		ntohs(inet_sk(sk)->inet_sport),
2395 		ireq->rmt_addr,
2396 		ntohs(ireq->rmt_port),
2397 		TCP_SYN_RECV,
2398 		0, 0, /* could print option size, but that is af dependent. */
2399 		1,    /* timers active (only the expire timer) */
2400 		jiffies_to_clock_t(ttd),
2401 		req->retrans,
2402 		uid,
2403 		0,  /* non standard timer */
2404 		0, /* open_requests have no inode */
2405 		atomic_read(&sk->sk_refcnt),
2406 		req,
2407 		len);
2408 }
2409 
2410 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2411 {
2412 	int timer_active;
2413 	unsigned long timer_expires;
2414 	struct tcp_sock *tp = tcp_sk(sk);
2415 	const struct inet_connection_sock *icsk = inet_csk(sk);
2416 	struct inet_sock *inet = inet_sk(sk);
2417 	__be32 dest = inet->inet_daddr;
2418 	__be32 src = inet->inet_rcv_saddr;
2419 	__u16 destp = ntohs(inet->inet_dport);
2420 	__u16 srcp = ntohs(inet->inet_sport);
2421 	int rx_queue;
2422 
2423 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2424 		timer_active	= 1;
2425 		timer_expires	= icsk->icsk_timeout;
2426 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2427 		timer_active	= 4;
2428 		timer_expires	= icsk->icsk_timeout;
2429 	} else if (timer_pending(&sk->sk_timer)) {
2430 		timer_active	= 2;
2431 		timer_expires	= sk->sk_timer.expires;
2432 	} else {
2433 		timer_active	= 0;
2434 		timer_expires = jiffies;
2435 	}
2436 
2437 	if (sk->sk_state == TCP_LISTEN)
2438 		rx_queue = sk->sk_ack_backlog;
2439 	else
2440 		/*
2441 		 * Because we don't lock the socket, we might find a transient negative value
2442 		 */
2443 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2444 
2445 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2446 			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2447 		i, src, srcp, dest, destp, sk->sk_state,
2448 		tp->write_seq - tp->snd_una,
2449 		rx_queue,
2450 		timer_active,
2451 		jiffies_to_clock_t(timer_expires - jiffies),
2452 		icsk->icsk_retransmits,
2453 		sock_i_uid(sk),
2454 		icsk->icsk_probes_out,
2455 		sock_i_ino(sk),
2456 		atomic_read(&sk->sk_refcnt), sk,
2457 		jiffies_to_clock_t(icsk->icsk_rto),
2458 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2459 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2460 		tp->snd_cwnd,
2461 		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2462 		len);
2463 }
2464 
2465 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2466 			       struct seq_file *f, int i, int *len)
2467 {
2468 	__be32 dest, src;
2469 	__u16 destp, srcp;
2470 	int ttd = tw->tw_ttd - jiffies;
2471 
2472 	if (ttd < 0)
2473 		ttd = 0;
2474 
2475 	dest  = tw->tw_daddr;
2476 	src   = tw->tw_rcv_saddr;
2477 	destp = ntohs(tw->tw_dport);
2478 	srcp  = ntohs(tw->tw_sport);
2479 
2480 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2481 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2482 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2483 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2484 		atomic_read(&tw->tw_refcnt), tw, len);
2485 }
2486 
2487 #define TMPSZ 150
2488 
2489 static int tcp4_seq_show(struct seq_file *seq, void *v)
2490 {
2491 	struct tcp_iter_state *st;
2492 	int len;
2493 
2494 	if (v == SEQ_START_TOKEN) {
2495 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2496 			   "  sl  local_address rem_address   st tx_queue "
2497 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2498 			   "inode");
2499 		goto out;
2500 	}
2501 	st = seq->private;
2502 
2503 	switch (st->state) {
2504 	case TCP_SEQ_STATE_LISTENING:
2505 	case TCP_SEQ_STATE_ESTABLISHED:
2506 		get_tcp4_sock(v, seq, st->num, &len);
2507 		break;
2508 	case TCP_SEQ_STATE_OPENREQ:
2509 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2510 		break;
2511 	case TCP_SEQ_STATE_TIME_WAIT:
2512 		get_timewait4_sock(v, seq, st->num, &len);
2513 		break;
2514 	}
2515 	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2516 out:
2517 	return 0;
2518 }
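
/*
 * Editorial example (illustrative values only): a single line emitted by
 * get_tcp4_sock() for an established socket looks roughly like
 *
 *	0: 0100007F:0016 0100007F:D28A 01 00000000:00000000 00:00000000
 *	00000000  1000 0 12345 1 ffff88003d3af3c0 20 4 30 10 -1
 *
 * (one line in the real output): hex local/remote address:port, state,
 * tx/rx queue, timer info, retransmits, uid, timeout, inode, followed by
 * the refcount, socket pointer and icsk/cwnd fields.
 */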
2519 
2520 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2521 	.name		= "tcp",
2522 	.family		= AF_INET,
2523 	.seq_fops	= {
2524 		.owner		= THIS_MODULE,
2525 	},
2526 	.seq_ops	= {
2527 		.show		= tcp4_seq_show,
2528 	},
2529 };
2530 
2531 static int __net_init tcp4_proc_init_net(struct net *net)
2532 {
2533 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2534 }
2535 
2536 static void __net_exit tcp4_proc_exit_net(struct net *net)
2537 {
2538 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2539 }
2540 
2541 static struct pernet_operations tcp4_net_ops = {
2542 	.init = tcp4_proc_init_net,
2543 	.exit = tcp4_proc_exit_net,
2544 };
2545 
2546 int __init tcp4_proc_init(void)
2547 {
2548 	return register_pernet_subsys(&tcp4_net_ops);
2549 }
2550 
2551 void tcp4_proc_exit(void)
2552 {
2553 	unregister_pernet_subsys(&tcp4_net_ops);
2554 }
2555 #endif /* CONFIG_PROC_FS */
2556 
2557 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2558 {
2559 	const struct iphdr *iph = skb_gro_network_header(skb);
2560 
2561 	switch (skb->ip_summed) {
2562 	case CHECKSUM_COMPLETE:
2563 		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2564 				  skb->csum)) {
2565 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2566 			break;
2567 		}
2568 
2569 		/* fall through */
2570 	case CHECKSUM_NONE:
2571 		NAPI_GRO_CB(skb)->flush = 1;
2572 		return NULL;
2573 	}
2574 
2575 	return tcp_gro_receive(head, skb);
2576 }
2577 
2578 int tcp4_gro_complete(struct sk_buff *skb)
2579 {
2580 	const struct iphdr *iph = ip_hdr(skb);
2581 	struct tcphdr *th = tcp_hdr(skb);
2582 
2583 	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2584 				  iph->saddr, iph->daddr, 0);
2585 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2586 
2587 	return tcp_gro_complete(skb);
2588 }
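
/*
 * Editorial note: the pair above mirrors the transmit-side GSO handling.
 * tcp4_gro_receive() only lets segments with a valid (or deferrable)
 * pseudo-header checksum be merged, and tcp4_gro_complete() rewrites
 * th->check to the pseudo-header complement and marks the skb as
 * SKB_GSO_TCPV4, so the merged super-packet carries a checksum consistent
 * with a single large TCPv4 segment.
 */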
2589 
2590 struct proto tcp_prot = {
2591 	.name			= "TCP",
2592 	.owner			= THIS_MODULE,
2593 	.close			= tcp_close,
2594 	.connect		= tcp_v4_connect,
2595 	.disconnect		= tcp_disconnect,
2596 	.accept			= inet_csk_accept,
2597 	.ioctl			= tcp_ioctl,
2598 	.init			= tcp_v4_init_sock,
2599 	.destroy		= tcp_v4_destroy_sock,
2600 	.shutdown		= tcp_shutdown,
2601 	.setsockopt		= tcp_setsockopt,
2602 	.getsockopt		= tcp_getsockopt,
2603 	.recvmsg		= tcp_recvmsg,
2604 	.sendmsg		= tcp_sendmsg,
2605 	.sendpage		= tcp_sendpage,
2606 	.backlog_rcv		= tcp_v4_do_rcv,
2607 	.hash			= inet_hash,
2608 	.unhash			= inet_unhash,
2609 	.get_port		= inet_csk_get_port,
2610 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2611 	.sockets_allocated	= &tcp_sockets_allocated,
2612 	.orphan_count		= &tcp_orphan_count,
2613 	.memory_allocated	= &tcp_memory_allocated,
2614 	.memory_pressure	= &tcp_memory_pressure,
2615 	.sysctl_mem		= sysctl_tcp_mem,
2616 	.sysctl_wmem		= sysctl_tcp_wmem,
2617 	.sysctl_rmem		= sysctl_tcp_rmem,
2618 	.max_header		= MAX_TCP_HEADER,
2619 	.obj_size		= sizeof(struct tcp_sock),
2620 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2621 	.twsk_prot		= &tcp_timewait_sock_ops,
2622 	.rsk_prot		= &tcp_request_sock_ops,
2623 	.h.hashinfo		= &tcp_hashinfo,
2624 	.no_autobind		= true,
2625 #ifdef CONFIG_COMPAT
2626 	.compat_setsockopt	= compat_tcp_setsockopt,
2627 	.compat_getsockopt	= compat_tcp_getsockopt,
2628 #endif
2629 };
2630 EXPORT_SYMBOL(tcp_prot);
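
/*
 * Editorial note: tcp_prot is not registered in this file; af_inet.c does
 * that during inet_init() (proto_register() plus the SOCK_STREAM entry in
 * inetsw_array), after which the .init/.destroy hooks above are invoked
 * for every IPv4 TCP socket.
 */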
2631 
2632 
2633 static int __net_init tcp_sk_init(struct net *net)
2634 {
2635 	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2636 				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2637 }
2638 
2639 static void __net_exit tcp_sk_exit(struct net *net)
2640 {
2641 	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2642 }
2643 
2644 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2645 {
2646 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2647 }
2648 
2649 static struct pernet_operations __net_initdata tcp_sk_ops = {
2650        .init	   = tcp_sk_init,
2651        .exit	   = tcp_sk_exit,
2652        .exit_batch = tcp_sk_exit_batch,
2653 };
2654 
2655 void __init tcp_v4_init(void)
2656 {
2657 	inet_hashinfo_init(&tcp_hashinfo);
2658 	if (register_pernet_subsys(&tcp_sk_ops))
2659 		panic("Failed to create the TCP control socket.\n");
2660 }
2661