xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision a8fe58ce)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after a year
45  *					in a coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <linux/crypto.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90 
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95 
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98 
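/* Note: the ISN below is derived from the incoming segment's 4-tuple by
 * secure_tcp_sequence_number() (net/core/secure_seq.c), which hashes the
 * addresses and ports with a boot-time secret and adds a clock component,
 * roughly the RFC 6528 scheme, so initial sequence numbers are hard to
 * predict off-path.
 */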
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102 					  ip_hdr(skb)->saddr,
103 					  tcp_hdr(skb)->dest,
104 					  tcp_hdr(skb)->source);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 
112 	/* With PAWS, it is safe from the viewpoint
113 	   of data integrity. Even without PAWS it is safe provided sequence
114 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
115 
116 	   Actually, the idea is close to VJ's one, only the timestamp cache is
117 	   held not per host, but per port pair, and the TW bucket is used as
118 	   the state holder.
119 
120 	   If the TW bucket has already been destroyed we fall back to VJ's scheme
121 	   and use the initial timestamp retrieved from the peer table.
122 	 */
123 	if (tcptw->tw_ts_recent_stamp &&
124 	    (!twp || (sysctl_tcp_tw_reuse &&
125 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
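		/* Start the new incarnation's sequence space just past anything
		 * the old one could still have had in flight (a full 64K window
		 * plus a little slack), so stray old segments cannot be taken
		 * for new data.
		 */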
126 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127 		if (tp->write_seq == 0)
128 			tp->write_seq = 1;
129 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
130 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131 		sock_hold(sktw);
132 		return 1;
133 	}
134 
135 	return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
138 
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143 	struct inet_sock *inet = inet_sk(sk);
144 	struct tcp_sock *tp = tcp_sk(sk);
145 	__be16 orig_sport, orig_dport;
146 	__be32 daddr, nexthop;
147 	struct flowi4 *fl4;
148 	struct rtable *rt;
149 	int err;
150 	struct ip_options_rcu *inet_opt;
151 
152 	if (addr_len < sizeof(struct sockaddr_in))
153 		return -EINVAL;
154 
155 	if (usin->sin_family != AF_INET)
156 		return -EAFNOSUPPORT;
157 
158 	nexthop = daddr = usin->sin_addr.s_addr;
159 	inet_opt = rcu_dereference_protected(inet->inet_opt,
160 					     sock_owned_by_user(sk));
161 	if (inet_opt && inet_opt->opt.srr) {
162 		if (!daddr)
163 			return -EINVAL;
164 		nexthop = inet_opt->opt.faddr;
165 	}
166 
167 	orig_sport = inet->inet_sport;
168 	orig_dport = usin->sin_port;
169 	fl4 = &inet->cork.fl.u.ip4;
170 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172 			      IPPROTO_TCP,
173 			      orig_sport, orig_dport, sk);
174 	if (IS_ERR(rt)) {
175 		err = PTR_ERR(rt);
176 		if (err == -ENETUNREACH)
177 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178 		return err;
179 	}
180 
181 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182 		ip_rt_put(rt);
183 		return -ENETUNREACH;
184 	}
185 
186 	if (!inet_opt || !inet_opt->opt.srr)
187 		daddr = fl4->daddr;
188 
189 	if (!inet->inet_saddr)
190 		inet->inet_saddr = fl4->saddr;
191 	sk_rcv_saddr_set(sk, inet->inet_saddr);
192 
193 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194 		/* Reset inherited state */
195 		tp->rx_opt.ts_recent	   = 0;
196 		tp->rx_opt.ts_recent_stamp = 0;
197 		if (likely(!tp->repair))
198 			tp->write_seq	   = 0;
199 	}
200 
201 	if (tcp_death_row.sysctl_tw_recycle &&
202 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203 		tcp_fetch_timewait_stamp(sk, &rt->dst);
204 
205 	inet->inet_dport = usin->sin_port;
206 	sk_daddr_set(sk, daddr);
207 
208 	inet_csk(sk)->icsk_ext_hdr_len = 0;
209 	if (inet_opt)
210 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211 
212 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213 
214 	/* Socket identity is still unknown (sport may be zero).
215 	 * However we set the state to SYN-SENT and, without releasing the socket
216 	 * lock, select a source port, enter ourselves into the hash tables and
217 	 * complete the initialization after this.
218 	 */
219 	tcp_set_state(sk, TCP_SYN_SENT);
220 	err = inet_hash_connect(&tcp_death_row, sk);
221 	if (err)
222 		goto failure;
223 
224 	sk_set_txhash(sk);
225 
226 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 			       inet->inet_sport, inet->inet_dport, sk);
228 	if (IS_ERR(rt)) {
229 		err = PTR_ERR(rt);
230 		rt = NULL;
231 		goto failure;
232 	}
233 	/* OK, now commit destination to socket.  */
234 	sk->sk_gso_type = SKB_GSO_TCPV4;
235 	sk_setup_caps(sk, &rt->dst);
236 
237 	if (!tp->write_seq && likely(!tp->repair))
238 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 							   inet->inet_daddr,
240 							   inet->inet_sport,
241 							   usin->sin_port);
242 
243 	inet->inet_id = tp->write_seq ^ jiffies;
244 
245 	err = tcp_connect(sk);
246 
247 	rt = NULL;
248 	if (err)
249 		goto failure;
250 
251 	return 0;
252 
253 failure:
254 	/*
255 	 * This unhashes the socket and releases the local port,
256 	 * if necessary.
257 	 */
258 	tcp_set_state(sk, TCP_CLOSE);
259 	ip_rt_put(rt);
260 	sk->sk_route_caps = 0;
261 	inet->inet_dport = 0;
262 	return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
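
/* Minimal userspace sketch (illustrative only; the address and port are made
 * up): tcp_v4_connect() is what ultimately runs, under the socket lock, when
 * an application calls connect(2) on an unconnected AF_INET SOCK_STREAM
 * socket:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * The SYN is emitted by tcp_connect() at the end of this function; the
 * connect(2) call then waits (or fails with EINPROGRESS for non-blocking
 * sockets) until the handshake completes.
 */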
265 
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if the socket was owned by the user
269  * at the time tcp_v4_err() was called to handle the ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 	struct dst_entry *dst;
274 	struct inet_sock *inet = inet_sk(sk);
275 	u32 mtu = tcp_sk(sk)->mtu_info;
276 
277 	dst = inet_csk_update_pmtu(sk, mtu);
278 	if (!dst)
279 		return;
280 
281 	/* Something is about to go wrong... Remember the soft error
282 	 * in case this connection is not able to recover.
283 	 */
284 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 		sk->sk_err_soft = EMSGSIZE;
286 
287 	mtu = dst_mtu(dst);
288 
289 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290 	    ip_sk_accept_pmtu(sk) &&
291 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292 		tcp_sync_mss(sk, mtu);
293 
294 		/* Resend the TCP packet because it's
295 		 * clear that the old packet has been
296 		 * dropped. This is the new "fast" path mtu
297 		 * discovery.
298 		 */
299 		tcp_simple_retransmit(sk);
300 	} /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303 
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306 	struct dst_entry *dst = __sk_dst_check(sk, 0);
307 
308 	if (dst)
309 		dst->ops->redirect(dst, sk, skb);
310 }
311 
312 
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq)
315 {
316 	struct request_sock *req = inet_reqsk(sk);
317 	struct net *net = sock_net(sk);
318 
319 	/* ICMPs are not backlogged, hence we cannot get
320 	 * an established socket here.
321 	 */
322 	WARN_ON(req->sk);
323 
324 	if (seq != tcp_rsk(req)->snt_isn) {
325 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
326 	} else {
327 		/*
328 		 * Still in SYN_RECV, just remove it silently.
329 		 * There is no good way to pass the error to the newly
330 		 * created socket, and POSIX does not want network
331 		 * errors returned from accept().
332 		 */
333 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
334 		NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
335 	}
336 	reqsk_put(req);
337 }
338 EXPORT_SYMBOL(tcp_req_err);
339 
340 /*
341  * This routine is called by the ICMP module when it gets some
342  * sort of error condition.  If err < 0 then the socket should
343  * be closed and the error returned to the user.  If err > 0
344  * it's just the icmp type << 8 | icmp code.  After adjustment,
345  * the header points to the first 8 bytes of the tcp header.  We need
346  * to find the appropriate port.
347  *
348  * The locking strategy used here is very "optimistic". When
349  * someone else accesses the socket the ICMP is just dropped
350  * and for some paths there is no check at all.
351  * A more general error queue to queue errors for later handling
352  * is probably better.
353  *
354  */
355 
356 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
357 {
358 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
359 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
360 	struct inet_connection_sock *icsk;
361 	struct tcp_sock *tp;
362 	struct inet_sock *inet;
363 	const int type = icmp_hdr(icmp_skb)->type;
364 	const int code = icmp_hdr(icmp_skb)->code;
365 	struct sock *sk;
366 	struct sk_buff *skb;
367 	struct request_sock *fastopen;
368 	__u32 seq, snd_una;
369 	__u32 remaining;
370 	int err;
371 	struct net *net = dev_net(icmp_skb->dev);
372 
373 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
374 				       th->dest, iph->saddr, ntohs(th->source),
375 				       inet_iif(icmp_skb));
376 	if (!sk) {
377 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
378 		return;
379 	}
380 	if (sk->sk_state == TCP_TIME_WAIT) {
381 		inet_twsk_put(inet_twsk(sk));
382 		return;
383 	}
384 	seq = ntohl(th->seq);
385 	if (sk->sk_state == TCP_NEW_SYN_RECV)
386 		return tcp_req_err(sk, seq);
387 
388 	bh_lock_sock(sk);
389 	/* If too many ICMPs get dropped on busy
390 	 * servers this needs to be solved differently.
391 	 * We do take care of the PMTU discovery (RFC1191) special case:
392 	 * we can receive locally generated ICMP messages while the socket is held.
393 	 */
394 	if (sock_owned_by_user(sk)) {
395 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
396 			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
397 	}
398 	if (sk->sk_state == TCP_CLOSE)
399 		goto out;
400 
401 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
402 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
403 		goto out;
404 	}
405 
406 	icsk = inet_csk(sk);
407 	tp = tcp_sk(sk);
408 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
409 	fastopen = tp->fastopen_rsk;
410 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
411 	if (sk->sk_state != TCP_LISTEN &&
412 	    !between(seq, snd_una, tp->snd_nxt)) {
413 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
414 		goto out;
415 	}
416 
417 	switch (type) {
418 	case ICMP_REDIRECT:
419 		do_redirect(icmp_skb, sk);
420 		goto out;
421 	case ICMP_SOURCE_QUENCH:
422 		/* Just silently ignore these. */
423 		goto out;
424 	case ICMP_PARAMETERPROB:
425 		err = EPROTO;
426 		break;
427 	case ICMP_DEST_UNREACH:
428 		if (code > NR_ICMP_UNREACH)
429 			goto out;
430 
431 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
432 			/* We are not interested in TCP_LISTEN and open_requests
433 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
434 			 * they should go through unfragmented).
435 			 */
436 			if (sk->sk_state == TCP_LISTEN)
437 				goto out;
438 
439 			tp->mtu_info = info;
440 			if (!sock_owned_by_user(sk)) {
441 				tcp_v4_mtu_reduced(sk);
442 			} else {
443 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
444 					sock_hold(sk);
445 			}
446 			goto out;
447 		}
448 
449 		err = icmp_err_convert[code].errno;
450 		/* check if icmp_skb allows reverting the backoff
451 		 * (see draft-zimmermann-tcp-lcd) */
452 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
453 			break;
454 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
455 		    !icsk->icsk_backoff || fastopen)
456 			break;
457 
458 		if (sock_owned_by_user(sk))
459 			break;
460 
461 		icsk->icsk_backoff--;
462 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
463 					       TCP_TIMEOUT_INIT;
464 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
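		/* Worked example (numbers are only illustrative): if the backoff
		 * just dropped from 3 to 2 and the sRTT-based RTO is 200ms, the
		 * clamped RTO becomes min(200ms << 2, TCP_RTO_MAX) = 800ms.  If
		 * 300ms have already passed since the head skb was sent, the
		 * timer below is re-armed for the remaining 500ms; if the new
		 * RTO has already expired, we retransmit immediately instead.
		 */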
465 
466 		skb = tcp_write_queue_head(sk);
467 		BUG_ON(!skb);
468 
469 		remaining = icsk->icsk_rto -
470 			    min(icsk->icsk_rto,
471 				tcp_time_stamp - tcp_skb_timestamp(skb));
472 
473 		if (remaining) {
474 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
475 						  remaining, TCP_RTO_MAX);
476 		} else {
477 			/* The RTO revert clocked out the retransmission.
478 			 * Will retransmit now. */
479 			tcp_retransmit_timer(sk);
480 		}
481 
482 		break;
483 	case ICMP_TIME_EXCEEDED:
484 		err = EHOSTUNREACH;
485 		break;
486 	default:
487 		goto out;
488 	}
489 
490 	switch (sk->sk_state) {
491 	case TCP_SYN_SENT:
492 	case TCP_SYN_RECV:
493 		/* Only in fast or simultaneous open. If a fast open socket
494 		 * is already accepted it is treated as a connected one below.
495 		 */
496 		if (fastopen && !fastopen->sk)
497 			break;
498 
499 		if (!sock_owned_by_user(sk)) {
500 			sk->sk_err = err;
501 
502 			sk->sk_error_report(sk);
503 
504 			tcp_done(sk);
505 		} else {
506 			sk->sk_err_soft = err;
507 		}
508 		goto out;
509 	}
510 
511 	/* If we've already connected we will keep trying
512 	 * until we time out, or the user gives up.
513 	 *
514 	 * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
515 	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
516 	 * but it is obsoleted by pmtu discovery).
517 	 *
518 	 * Note that in the modern internet, where routing is unreliable
519 	 * and broken firewalls sit in every dark corner, sending random
520 	 * errors ordered by their masters, even these two messages finally lose
521 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
522 	 *
523 	 * Now we are in compliance with RFCs.
524 	 *							--ANK (980905)
525 	 */
526 
527 	inet = inet_sk(sk);
528 	if (!sock_owned_by_user(sk) && inet->recverr) {
529 		sk->sk_err = err;
530 		sk->sk_error_report(sk);
531 	} else	{ /* Only an error on timeout */
532 		sk->sk_err_soft = err;
533 	}
534 
535 out:
536 	bh_unlock_sock(sk);
537 	sock_put(sk);
538 }
539 
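/* Fill in the TCP checksum of an outgoing skb.  With CHECKSUM_PARTIAL only
 * the pseudo-header sum is stored in th->check, and csum_start/csum_offset
 * tell the device (or skb_checksum_help()) where to finish the job;
 * otherwise the full checksum is computed in software right here.
 */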
540 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
541 {
542 	struct tcphdr *th = tcp_hdr(skb);
543 
544 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
545 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
546 		skb->csum_start = skb_transport_header(skb) - skb->head;
547 		skb->csum_offset = offsetof(struct tcphdr, check);
548 	} else {
549 		th->check = tcp_v4_check(skb->len, saddr, daddr,
550 					 csum_partial(th,
551 						      th->doff << 2,
552 						      skb->csum));
553 	}
554 }
555 
556 /* This routine computes an IPv4 TCP checksum. */
557 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
558 {
559 	const struct inet_sock *inet = inet_sk(sk);
560 
561 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
562 }
563 EXPORT_SYMBOL(tcp_v4_send_check);
564 
565 /*
566  *	This routine will send an RST to the other tcp.
567  *
568  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
569  *		      for the reset?
570  *	Answer: if a packet caused an RST, it is not for a socket
571  *		existing in our system; if it is matched to a socket,
572  *		it is just a duplicate segment or a bug in the other side's TCP.
573  *		So we build the reply based only on the parameters
574  *		that arrived with the segment.
575  *	Exception: precedence violation. We do not implement it in any case.
576  */
577 
578 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
579 {
580 	const struct tcphdr *th = tcp_hdr(skb);
581 	struct {
582 		struct tcphdr th;
583 #ifdef CONFIG_TCP_MD5SIG
584 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
585 #endif
586 	} rep;
587 	struct ip_reply_arg arg;
588 #ifdef CONFIG_TCP_MD5SIG
589 	struct tcp_md5sig_key *key = NULL;
590 	const __u8 *hash_location = NULL;
591 	unsigned char newhash[16];
592 	int genhash;
593 	struct sock *sk1 = NULL;
594 #endif
595 	struct net *net;
596 
597 	/* Never send a reset in response to a reset. */
598 	if (th->rst)
599 		return;
600 
601 	/* If sk is not NULL, it means we did a successful lookup and the incoming
602 	 * route had to be correct. prequeue might have dropped our dst.
603 	 */
604 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
605 		return;
606 
607 	/* Swap the send and the receive. */
608 	memset(&rep, 0, sizeof(rep));
609 	rep.th.dest   = th->source;
610 	rep.th.source = th->dest;
611 	rep.th.doff   = sizeof(struct tcphdr) / 4;
612 	rep.th.rst    = 1;
613 
614 	if (th->ack) {
615 		rep.th.seq = th->ack_seq;
616 	} else {
617 		rep.th.ack = 1;
618 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
619 				       skb->len - (th->doff << 2));
620 	}
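	/* Example (purely illustrative): for an unsolicited data-less SYN with
	 * sequence number S, the branch above produces an RST|ACK with seq 0
	 * and ack_seq S + 1, which is the RFC 793 reset-generation rule for
	 * segments that carry no ACK.
	 */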
621 
622 	memset(&arg, 0, sizeof(arg));
623 	arg.iov[0].iov_base = (unsigned char *)&rep;
624 	arg.iov[0].iov_len  = sizeof(rep.th);
625 
626 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
627 #ifdef CONFIG_TCP_MD5SIG
628 	hash_location = tcp_parse_md5sig_option(th);
629 	if (sk && sk_fullsock(sk)) {
630 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
631 					&ip_hdr(skb)->saddr, AF_INET);
632 	} else if (hash_location) {
633 		/*
634 		 * The active side is lost. Try to find the listening socket through
635 		 * the source port, and then find the md5 key through the listening socket.
636 		 * We do not loosen security here:
637 		 * the incoming packet is checked against the md5 hash of the found key,
638 		 * and no RST is generated if the md5 hash doesn't match.
639 		 */
640 		sk1 = __inet_lookup_listener(net,
641 					     &tcp_hashinfo, ip_hdr(skb)->saddr,
642 					     th->source, ip_hdr(skb)->daddr,
643 					     ntohs(th->source), inet_iif(skb));
644 		/* don't send an RST if we can't find the key */
645 		if (!sk1)
646 			return;
647 		rcu_read_lock();
648 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
649 					&ip_hdr(skb)->saddr, AF_INET);
650 		if (!key)
651 			goto release_sk1;
652 
653 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
654 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
655 			goto release_sk1;
656 	}
657 
658 	if (key) {
659 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
660 				   (TCPOPT_NOP << 16) |
661 				   (TCPOPT_MD5SIG << 8) |
662 				   TCPOLEN_MD5SIG);
663 		/* Update length and the length the header thinks exists */
664 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
665 		rep.th.doff = arg.iov[0].iov_len / 4;
666 
667 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
668 				     key, ip_hdr(skb)->saddr,
669 				     ip_hdr(skb)->daddr, &rep.th);
670 	}
671 #endif
672 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
673 				      ip_hdr(skb)->saddr, /* XXX */
674 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
675 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
676 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
677 
678 	/* When the socket is gone, all binding information is lost.
679 	 * Routing might fail in this case. No choice here: if we choose to force
680 	 * the input interface, we will misroute in case of an asymmetric route.
681 	 */
682 	if (sk)
683 		arg.bound_dev_if = sk->sk_bound_dev_if;
684 
685 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
686 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
687 
688 	arg.tos = ip_hdr(skb)->tos;
689 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
690 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
691 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
692 			      &arg, arg.iov[0].iov_len);
693 
694 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
695 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
696 
697 #ifdef CONFIG_TCP_MD5SIG
698 release_sk1:
699 	if (sk1) {
700 		rcu_read_unlock();
701 		sock_put(sk1);
702 	}
703 #endif
704 }
705 
706 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
707    outside of socket context, is certainly ugly. What can I do?
708  */
709 
710 static void tcp_v4_send_ack(struct net *net,
711 			    struct sk_buff *skb, u32 seq, u32 ack,
712 			    u32 win, u32 tsval, u32 tsecr, int oif,
713 			    struct tcp_md5sig_key *key,
714 			    int reply_flags, u8 tos)
715 {
716 	const struct tcphdr *th = tcp_hdr(skb);
717 	struct {
718 		struct tcphdr th;
719 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
720 #ifdef CONFIG_TCP_MD5SIG
721 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
722 #endif
723 			];
724 	} rep;
725 	struct ip_reply_arg arg;
726 
727 	memset(&rep.th, 0, sizeof(struct tcphdr));
728 	memset(&arg, 0, sizeof(arg));
729 
730 	arg.iov[0].iov_base = (unsigned char *)&rep;
731 	arg.iov[0].iov_len  = sizeof(rep.th);
732 	if (tsecr) {
733 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
734 				   (TCPOPT_TIMESTAMP << 8) |
735 				   TCPOLEN_TIMESTAMP);
736 		rep.opt[1] = htonl(tsval);
737 		rep.opt[2] = htonl(tsecr);
738 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
739 	}
740 
741 	/* Swap the send and the receive. */
742 	rep.th.dest    = th->source;
743 	rep.th.source  = th->dest;
744 	rep.th.doff    = arg.iov[0].iov_len / 4;
745 	rep.th.seq     = htonl(seq);
746 	rep.th.ack_seq = htonl(ack);
747 	rep.th.ack     = 1;
748 	rep.th.window  = htons(win);
749 
750 #ifdef CONFIG_TCP_MD5SIG
751 	if (key) {
752 		int offset = (tsecr) ? 3 : 0;
753 
754 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
755 					  (TCPOPT_NOP << 16) |
756 					  (TCPOPT_MD5SIG << 8) |
757 					  TCPOLEN_MD5SIG);
758 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
759 		rep.th.doff = arg.iov[0].iov_len/4;
760 
761 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
762 				    key, ip_hdr(skb)->saddr,
763 				    ip_hdr(skb)->daddr, &rep.th);
764 	}
765 #endif
766 	arg.flags = reply_flags;
767 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
768 				      ip_hdr(skb)->saddr, /* XXX */
769 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
770 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
771 	if (oif)
772 		arg.bound_dev_if = oif;
773 	arg.tos = tos;
774 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
775 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
776 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
777 			      &arg, arg.iov[0].iov_len);
778 
779 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
780 }
781 
782 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
783 {
784 	struct inet_timewait_sock *tw = inet_twsk(sk);
785 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
786 
787 	tcp_v4_send_ack(sock_net(sk), skb,
788 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
789 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
790 			tcp_time_stamp + tcptw->tw_ts_offset,
791 			tcptw->tw_ts_recent,
792 			tw->tw_bound_dev_if,
793 			tcp_twsk_md5_key(tcptw),
794 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
795 			tw->tw_tos
796 			);
797 
798 	inet_twsk_put(tw);
799 }
800 
801 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
802 				  struct request_sock *req)
803 {
804 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
805 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
806 	 */
807 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
808 					     tcp_sk(sk)->snd_nxt;
809 
810 	tcp_v4_send_ack(sock_net(sk), skb, seq,
811 			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
812 			tcp_time_stamp,
813 			req->ts_recent,
814 			0,
815 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
816 					  AF_INET),
817 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
818 			ip_hdr(skb)->tos);
819 }
820 
821 /*
822  *	Send a SYN-ACK after having received a SYN.
823  *	This still operates on a request_sock only, not on a big
824  *	socket.
825  */
826 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
827 			      struct flowi *fl,
828 			      struct request_sock *req,
829 			      struct tcp_fastopen_cookie *foc,
830 			      bool attach_req)
831 {
832 	const struct inet_request_sock *ireq = inet_rsk(req);
833 	struct flowi4 fl4;
834 	int err = -1;
835 	struct sk_buff *skb;
836 
837 	/* First, grab a route. */
838 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
839 		return -1;
840 
841 	skb = tcp_make_synack(sk, dst, req, foc, attach_req);
842 
843 	if (skb) {
844 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
845 
846 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
847 					    ireq->ir_rmt_addr,
848 					    ireq->opt);
849 		err = net_xmit_eval(err);
850 	}
851 
852 	return err;
853 }
854 
855 /*
856  *	IPv4 request_sock destructor.
857  */
858 static void tcp_v4_reqsk_destructor(struct request_sock *req)
859 {
860 	kfree(inet_rsk(req)->opt);
861 }
862 
863 
864 #ifdef CONFIG_TCP_MD5SIG
865 /*
866  * RFC2385 MD5 checksumming requires a mapping of
867  * IP address->MD5 Key.
868  * We need to maintain these in the sk structure.
869  */
870 
871 /* Find the Key structure for an address.  */
872 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
873 					 const union tcp_md5_addr *addr,
874 					 int family)
875 {
876 	const struct tcp_sock *tp = tcp_sk(sk);
877 	struct tcp_md5sig_key *key;
878 	unsigned int size = sizeof(struct in_addr);
879 	const struct tcp_md5sig_info *md5sig;
880 
881 	/* caller either holds rcu_read_lock() or socket lock */
882 	md5sig = rcu_dereference_check(tp->md5sig_info,
883 				       sock_owned_by_user(sk) ||
884 				       lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
885 	if (!md5sig)
886 		return NULL;
887 #if IS_ENABLED(CONFIG_IPV6)
888 	if (family == AF_INET6)
889 		size = sizeof(struct in6_addr);
890 #endif
891 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
892 		if (key->family != family)
893 			continue;
894 		if (!memcmp(&key->addr, addr, size))
895 			return key;
896 	}
897 	return NULL;
898 }
899 EXPORT_SYMBOL(tcp_md5_do_lookup);
900 
901 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
902 					 const struct sock *addr_sk)
903 {
904 	const union tcp_md5_addr *addr;
905 
906 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
907 	return tcp_md5_do_lookup(sk, addr, AF_INET);
908 }
909 EXPORT_SYMBOL(tcp_v4_md5_lookup);
910 
911 /* This can be called on a newly created socket, from other files */
912 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
913 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
914 {
915 	/* Add Key to the list */
916 	struct tcp_md5sig_key *key;
917 	struct tcp_sock *tp = tcp_sk(sk);
918 	struct tcp_md5sig_info *md5sig;
919 
920 	key = tcp_md5_do_lookup(sk, addr, family);
921 	if (key) {
922 		/* Pre-existing entry - just update that one. */
923 		memcpy(key->key, newkey, newkeylen);
924 		key->keylen = newkeylen;
925 		return 0;
926 	}
927 
928 	md5sig = rcu_dereference_protected(tp->md5sig_info,
929 					   sock_owned_by_user(sk) ||
930 					   lockdep_is_held(&sk->sk_lock.slock));
931 	if (!md5sig) {
932 		md5sig = kmalloc(sizeof(*md5sig), gfp);
933 		if (!md5sig)
934 			return -ENOMEM;
935 
936 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
937 		INIT_HLIST_HEAD(&md5sig->head);
938 		rcu_assign_pointer(tp->md5sig_info, md5sig);
939 	}
940 
941 	key = sock_kmalloc(sk, sizeof(*key), gfp);
942 	if (!key)
943 		return -ENOMEM;
944 	if (!tcp_alloc_md5sig_pool()) {
945 		sock_kfree_s(sk, key, sizeof(*key));
946 		return -ENOMEM;
947 	}
948 
949 	memcpy(key->key, newkey, newkeylen);
950 	key->keylen = newkeylen;
951 	key->family = family;
952 	memcpy(&key->addr, addr,
953 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
954 				      sizeof(struct in_addr));
955 	hlist_add_head_rcu(&key->node, &md5sig->head);
956 	return 0;
957 }
958 EXPORT_SYMBOL(tcp_md5_do_add);
959 
960 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
961 {
962 	struct tcp_md5sig_key *key;
963 
964 	key = tcp_md5_do_lookup(sk, addr, family);
965 	if (!key)
966 		return -ENOENT;
967 	hlist_del_rcu(&key->node);
968 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
969 	kfree_rcu(key, rcu);
970 	return 0;
971 }
972 EXPORT_SYMBOL(tcp_md5_do_del);
973 
974 static void tcp_clear_md5_list(struct sock *sk)
975 {
976 	struct tcp_sock *tp = tcp_sk(sk);
977 	struct tcp_md5sig_key *key;
978 	struct hlist_node *n;
979 	struct tcp_md5sig_info *md5sig;
980 
981 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
982 
983 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
984 		hlist_del_rcu(&key->node);
985 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
986 		kfree_rcu(key, rcu);
987 	}
988 }
989 
990 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
991 				 int optlen)
992 {
993 	struct tcp_md5sig cmd;
994 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
995 
996 	if (optlen < sizeof(cmd))
997 		return -EINVAL;
998 
999 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1000 		return -EFAULT;
1001 
1002 	if (sin->sin_family != AF_INET)
1003 		return -EINVAL;
1004 
1005 	if (!cmd.tcpm_keylen)
1006 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1007 				      AF_INET);
1008 
1009 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1010 		return -EINVAL;
1011 
1012 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1013 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1014 			      GFP_KERNEL);
1015 }
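
/* Userspace sketch of how a key reaches tcp_v4_parse_md5_keys() (illustrative
 * only; the peer address and key are made up):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that address, as handled above.
 */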
1016 
1017 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1018 					__be32 daddr, __be32 saddr, int nbytes)
1019 {
1020 	struct tcp4_pseudohdr *bp;
1021 	struct scatterlist sg;
1022 
1023 	bp = &hp->md5_blk.ip4;
1024 
1025 	/*
1026 	 * 1. the TCP pseudo-header (in the order: source IP address,
1027 	 * destination IP address, zero-padded protocol number, and
1028 	 * segment length)
1029 	 */
1030 	bp->saddr = saddr;
1031 	bp->daddr = daddr;
1032 	bp->pad = 0;
1033 	bp->protocol = IPPROTO_TCP;
1034 	bp->len = cpu_to_be16(nbytes);
1035 
1036 	sg_init_one(&sg, bp, sizeof(*bp));
1037 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1038 }
1039 
1040 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1041 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1042 {
1043 	struct tcp_md5sig_pool *hp;
1044 	struct hash_desc *desc;
1045 
1046 	hp = tcp_get_md5sig_pool();
1047 	if (!hp)
1048 		goto clear_hash_noput;
1049 	desc = &hp->md5_desc;
1050 
1051 	if (crypto_hash_init(desc))
1052 		goto clear_hash;
1053 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1054 		goto clear_hash;
1055 	if (tcp_md5_hash_header(hp, th))
1056 		goto clear_hash;
1057 	if (tcp_md5_hash_key(hp, key))
1058 		goto clear_hash;
1059 	if (crypto_hash_final(desc, md5_hash))
1060 		goto clear_hash;
1061 
1062 	tcp_put_md5sig_pool();
1063 	return 0;
1064 
1065 clear_hash:
1066 	tcp_put_md5sig_pool();
1067 clear_hash_noput:
1068 	memset(md5_hash, 0, 16);
1069 	return 1;
1070 }
1071 
1072 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1073 			const struct sock *sk,
1074 			const struct sk_buff *skb)
1075 {
1076 	struct tcp_md5sig_pool *hp;
1077 	struct hash_desc *desc;
1078 	const struct tcphdr *th = tcp_hdr(skb);
1079 	__be32 saddr, daddr;
1080 
1081 	if (sk) { /* valid for establish/request sockets */
1082 		saddr = sk->sk_rcv_saddr;
1083 		daddr = sk->sk_daddr;
1084 	} else {
1085 		const struct iphdr *iph = ip_hdr(skb);
1086 		saddr = iph->saddr;
1087 		daddr = iph->daddr;
1088 	}
1089 
1090 	hp = tcp_get_md5sig_pool();
1091 	if (!hp)
1092 		goto clear_hash_noput;
1093 	desc = &hp->md5_desc;
1094 
1095 	if (crypto_hash_init(desc))
1096 		goto clear_hash;
1097 
1098 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1099 		goto clear_hash;
1100 	if (tcp_md5_hash_header(hp, th))
1101 		goto clear_hash;
1102 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1103 		goto clear_hash;
1104 	if (tcp_md5_hash_key(hp, key))
1105 		goto clear_hash;
1106 	if (crypto_hash_final(desc, md5_hash))
1107 		goto clear_hash;
1108 
1109 	tcp_put_md5sig_pool();
1110 	return 0;
1111 
1112 clear_hash:
1113 	tcp_put_md5sig_pool();
1114 clear_hash_noput:
1115 	memset(md5_hash, 0, 16);
1116 	return 1;
1117 }
1118 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1119 
1120 #endif
1121 
1122 /* Called with rcu_read_lock() */
1123 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1124 				    const struct sk_buff *skb)
1125 {
1126 #ifdef CONFIG_TCP_MD5SIG
1127 	/*
1128 	 * This gets called for each TCP segment that arrives
1129 	 * so we want to be efficient.
1130 	 * We have 3 drop cases:
1131 	 * o No MD5 hash and one expected.
1132 	 * o MD5 hash and we're not expecting one.
1133 	 * o MD5 hash and it's wrong.
1134 	 */
1135 	const __u8 *hash_location = NULL;
1136 	struct tcp_md5sig_key *hash_expected;
1137 	const struct iphdr *iph = ip_hdr(skb);
1138 	const struct tcphdr *th = tcp_hdr(skb);
1139 	int genhash;
1140 	unsigned char newhash[16];
1141 
1142 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1143 					  AF_INET);
1144 	hash_location = tcp_parse_md5sig_option(th);
1145 
1146 	/* We've parsed the options - do we have a hash? */
1147 	if (!hash_expected && !hash_location)
1148 		return false;
1149 
1150 	if (hash_expected && !hash_location) {
1151 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1152 		return true;
1153 	}
1154 
1155 	if (!hash_expected && hash_location) {
1156 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1157 		return true;
1158 	}
1159 
1160 	/* Okay, so we have both hash_expected and hash_location -
1161 	 * so we need to calculate the checksum.
1162 	 */
1163 	genhash = tcp_v4_md5_hash_skb(newhash,
1164 				      hash_expected,
1165 				      NULL, skb);
1166 
1167 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1168 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1169 				     &iph->saddr, ntohs(th->source),
1170 				     &iph->daddr, ntohs(th->dest),
1171 				     genhash ? " tcp_v4_calc_md5_hash failed"
1172 				     : "");
1173 		return true;
1174 	}
1175 	return false;
1176 #endif
1177 	return false;
1178 }
1179 
1180 static void tcp_v4_init_req(struct request_sock *req,
1181 			    const struct sock *sk_listener,
1182 			    struct sk_buff *skb)
1183 {
1184 	struct inet_request_sock *ireq = inet_rsk(req);
1185 
1186 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1187 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1188 	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1189 	ireq->opt = tcp_v4_save_options(skb);
1190 }
1191 
1192 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1193 					  struct flowi *fl,
1194 					  const struct request_sock *req,
1195 					  bool *strict)
1196 {
1197 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1198 
1199 	if (strict) {
1200 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1201 			*strict = true;
1202 		else
1203 			*strict = false;
1204 	}
1205 
1206 	return dst;
1207 }
1208 
1209 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1210 	.family		=	PF_INET,
1211 	.obj_size	=	sizeof(struct tcp_request_sock),
1212 	.rtx_syn_ack	=	tcp_rtx_synack,
1213 	.send_ack	=	tcp_v4_reqsk_send_ack,
1214 	.destructor	=	tcp_v4_reqsk_destructor,
1215 	.send_reset	=	tcp_v4_send_reset,
1216 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1217 };
1218 
1219 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1220 	.mss_clamp	=	TCP_MSS_DEFAULT,
1221 #ifdef CONFIG_TCP_MD5SIG
1222 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1223 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1224 #endif
1225 	.init_req	=	tcp_v4_init_req,
1226 #ifdef CONFIG_SYN_COOKIES
1227 	.cookie_init_seq =	cookie_v4_init_sequence,
1228 #endif
1229 	.route_req	=	tcp_v4_route_req,
1230 	.init_seq	=	tcp_v4_init_sequence,
1231 	.send_synack	=	tcp_v4_send_synack,
1232 };
1233 
1234 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1235 {
1236 	/* Never answer SYNs sent to broadcast or multicast */
1237 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1238 		goto drop;
1239 
1240 	return tcp_conn_request(&tcp_request_sock_ops,
1241 				&tcp_request_sock_ipv4_ops, sk, skb);
1242 
1243 drop:
1244 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1245 	return 0;
1246 }
1247 EXPORT_SYMBOL(tcp_v4_conn_request);
1248 
1249 
1250 /*
1251  * The three-way handshake has completed - we got a valid ACK -
1252  * now create the new socket.
1253  */
1254 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1255 				  struct request_sock *req,
1256 				  struct dst_entry *dst,
1257 				  struct request_sock *req_unhash,
1258 				  bool *own_req)
1259 {
1260 	struct inet_request_sock *ireq;
1261 	struct inet_sock *newinet;
1262 	struct tcp_sock *newtp;
1263 	struct sock *newsk;
1264 #ifdef CONFIG_TCP_MD5SIG
1265 	struct tcp_md5sig_key *key;
1266 #endif
1267 	struct ip_options_rcu *inet_opt;
1268 
1269 	if (sk_acceptq_is_full(sk))
1270 		goto exit_overflow;
1271 
1272 	newsk = tcp_create_openreq_child(sk, req, skb);
1273 	if (!newsk)
1274 		goto exit_nonewsk;
1275 
1276 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1277 	inet_sk_rx_dst_set(newsk, skb);
1278 
1279 	newtp		      = tcp_sk(newsk);
1280 	newinet		      = inet_sk(newsk);
1281 	ireq		      = inet_rsk(req);
1282 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1283 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1284 	newsk->sk_bound_dev_if = ireq->ir_iif;
1285 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1286 	inet_opt	      = ireq->opt;
1287 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1288 	ireq->opt	      = NULL;
1289 	newinet->mc_index     = inet_iif(skb);
1290 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1291 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1292 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1293 	if (inet_opt)
1294 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1295 	newinet->inet_id = newtp->write_seq ^ jiffies;
1296 
1297 	if (!dst) {
1298 		dst = inet_csk_route_child_sock(sk, newsk, req);
1299 		if (!dst)
1300 			goto put_and_exit;
1301 	} else {
1302 		/* syncookie case : see end of cookie_v4_check() */
1303 	}
1304 	sk_setup_caps(newsk, dst);
1305 
1306 	tcp_ca_openreq_child(newsk, dst);
1307 
1308 	tcp_sync_mss(newsk, dst_mtu(dst));
1309 	newtp->advmss = dst_metric_advmss(dst);
1310 	if (tcp_sk(sk)->rx_opt.user_mss &&
1311 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1312 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1313 
1314 	tcp_initialize_rcv_mss(newsk);
1315 
1316 #ifdef CONFIG_TCP_MD5SIG
1317 	/* Copy over the MD5 key from the original socket */
1318 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1319 				AF_INET);
1320 	if (key) {
1321 		/*
1322 		 * We're using one, so create a matching key
1323 		 * on the newsk structure. If we fail to get
1324 		 * memory, then we end up not copying the key
1325 		 * across. Shucks.
1326 		 */
1327 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1328 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1329 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1330 	}
1331 #endif
1332 
1333 	if (__inet_inherit_port(sk, newsk) < 0)
1334 		goto put_and_exit;
1335 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1336 	if (*own_req)
1337 		tcp_move_syn(newtp, req);
1338 
1339 	return newsk;
1340 
1341 exit_overflow:
1342 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1343 exit_nonewsk:
1344 	dst_release(dst);
1345 exit:
1346 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1347 	return NULL;
1348 put_and_exit:
1349 	inet_csk_prepare_forced_close(newsk);
1350 	tcp_done(newsk);
1351 	goto exit;
1352 }
1353 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1354 
1355 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1356 {
1357 #ifdef CONFIG_SYN_COOKIES
1358 	const struct tcphdr *th = tcp_hdr(skb);
1359 
1360 	if (!th->syn)
1361 		sk = cookie_v4_check(sk, skb);
1362 #endif
1363 	return sk;
1364 }
1365 
1366 /* The socket must have its spinlock held when we get
1367  * here, unless it is a TCP_LISTEN socket.
1368  *
1369  * We have a potential double-lock case here, so even when
1370  * doing backlog processing we use the BH locking scheme.
1371  * This is because we cannot sleep with the original spinlock
1372  * held.
1373  */
1374 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1375 {
1376 	struct sock *rsk;
1377 
1378 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1379 		struct dst_entry *dst = sk->sk_rx_dst;
1380 
1381 		sock_rps_save_rxhash(sk, skb);
1382 		sk_mark_napi_id(sk, skb);
1383 		if (dst) {
1384 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1385 			    !dst->ops->check(dst, 0)) {
1386 				dst_release(dst);
1387 				sk->sk_rx_dst = NULL;
1388 			}
1389 		}
1390 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1391 		return 0;
1392 	}
1393 
1394 	if (tcp_checksum_complete(skb))
1395 		goto csum_err;
1396 
1397 	if (sk->sk_state == TCP_LISTEN) {
1398 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1399 
1400 		if (!nsk)
1401 			goto discard;
1402 		if (nsk != sk) {
1403 			sock_rps_save_rxhash(nsk, skb);
1404 			sk_mark_napi_id(nsk, skb);
1405 			if (tcp_child_process(sk, nsk, skb)) {
1406 				rsk = nsk;
1407 				goto reset;
1408 			}
1409 			return 0;
1410 		}
1411 	} else
1412 		sock_rps_save_rxhash(sk, skb);
1413 
1414 	if (tcp_rcv_state_process(sk, skb)) {
1415 		rsk = sk;
1416 		goto reset;
1417 	}
1418 	return 0;
1419 
1420 reset:
1421 	tcp_v4_send_reset(rsk, skb);
1422 discard:
1423 	kfree_skb(skb);
1424 	/* Be careful here. If this function gets more complicated and
1425 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1426 	 * might be destroyed here. This current version compiles correctly,
1427 	 * but you have been warned.
1428 	 */
1429 	return 0;
1430 
1431 csum_err:
1432 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1433 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1434 	goto discard;
1435 }
1436 EXPORT_SYMBOL(tcp_v4_do_rcv);
1437 
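/* Early demux: while the IP layer is still processing the packet, look up the
 * established socket it belongs to, attach it to the skb, and reuse the
 * socket's cached rx dst, so that the normal routing decision (and a second
 * socket lookup in tcp_v4_rcv()) can be skipped in the common case.
 */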
1438 void tcp_v4_early_demux(struct sk_buff *skb)
1439 {
1440 	const struct iphdr *iph;
1441 	const struct tcphdr *th;
1442 	struct sock *sk;
1443 
1444 	if (skb->pkt_type != PACKET_HOST)
1445 		return;
1446 
1447 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1448 		return;
1449 
1450 	iph = ip_hdr(skb);
1451 	th = tcp_hdr(skb);
1452 
1453 	if (th->doff < sizeof(struct tcphdr) / 4)
1454 		return;
1455 
1456 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1457 				       iph->saddr, th->source,
1458 				       iph->daddr, ntohs(th->dest),
1459 				       skb->skb_iif);
1460 	if (sk) {
1461 		skb->sk = sk;
1462 		skb->destructor = sock_edemux;
1463 		if (sk_fullsock(sk)) {
1464 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1465 
1466 			if (dst)
1467 				dst = dst_check(dst, 0);
1468 			if (dst &&
1469 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1470 				skb_dst_set_noref(skb, dst);
1471 		}
1472 	}
1473 }
1474 
1475 /* Packet is added to VJ-style prequeue for processing in process
1476  * context, if a reader task is waiting. Apparently, this exciting
1477  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1478  * failed somewhere. Latency? Burstiness? Well, at least now we will
1479  * see why it failed. 8)8)				  --ANK
1480  *
1481  */
1482 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1483 {
1484 	struct tcp_sock *tp = tcp_sk(sk);
1485 
1486 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1487 		return false;
1488 
1489 	if (skb->len <= tcp_hdrlen(skb) &&
1490 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1491 		return false;
1492 
1493 	/* Before escaping RCU protected region, we need to take care of skb
1494 	 * dst. Prequeue is only enabled for established sockets.
1495 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1496 	 * Instead of doing a full sk_rx_dst validity check here, let's perform
1497 	 * an optimistic check.
1498 	 */
1499 	if (likely(sk->sk_rx_dst))
1500 		skb_dst_drop(skb);
1501 	else
1502 		skb_dst_force_safe(skb);
1503 
1504 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1505 	tp->ucopy.memory += skb->truesize;
1506 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1507 		struct sk_buff *skb1;
1508 
1509 		BUG_ON(sock_owned_by_user(sk));
1510 
1511 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1512 			sk_backlog_rcv(sk, skb1);
1513 			NET_INC_STATS_BH(sock_net(sk),
1514 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1515 		}
1516 
1517 		tp->ucopy.memory = 0;
1518 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1519 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1520 					   POLLIN | POLLRDNORM | POLLRDBAND);
1521 		if (!inet_csk_ack_scheduled(sk))
1522 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1523 						  (3 * tcp_rto_min(sk)) / 4,
1524 						  TCP_RTO_MAX);
1525 	}
1526 	return true;
1527 }
1528 EXPORT_SYMBOL(tcp_prequeue);
1529 
1530 /*
1531  *	From tcp_input.c
1532  */
1533 
1534 int tcp_v4_rcv(struct sk_buff *skb)
1535 {
1536 	const struct iphdr *iph;
1537 	const struct tcphdr *th;
1538 	struct sock *sk;
1539 	int ret;
1540 	struct net *net = dev_net(skb->dev);
1541 
1542 	if (skb->pkt_type != PACKET_HOST)
1543 		goto discard_it;
1544 
1545 	/* Count it even if it's bad */
1546 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1547 
1548 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1549 		goto discard_it;
1550 
1551 	th = tcp_hdr(skb);
1552 
1553 	if (th->doff < sizeof(struct tcphdr) / 4)
1554 		goto bad_packet;
1555 	if (!pskb_may_pull(skb, th->doff * 4))
1556 		goto discard_it;
1557 
1558 	/* An explanation is required here, I think.
1559 	 * Packet length and doff are validated by header prediction,
1560 	 * provided the case of th->doff==0 is eliminated.
1561 	 * So, we defer the checks. */
1562 
1563 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1564 		goto csum_error;
1565 
1566 	th = tcp_hdr(skb);
1567 	iph = ip_hdr(skb);
1568 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1569 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1570 	 */
1571 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1572 		sizeof(struct inet_skb_parm));
1573 	barrier();
1574 
1575 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1576 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1577 				    skb->len - th->doff * 4);
1578 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1579 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1580 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1581 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1582 	TCP_SKB_CB(skb)->sacked	 = 0;
1583 
1584 lookup:
1585 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1586 	if (!sk)
1587 		goto no_tcp_socket;
1588 
1589 process:
1590 	if (sk->sk_state == TCP_TIME_WAIT)
1591 		goto do_time_wait;
1592 
1593 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
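		/* The lookup hit a request socket (a SYN_RECV mini-socket): this
		 * segment should be the final ACK of the three-way handshake (or
		 * a retransmitted SYN), so hand it to the listener via
		 * tcp_check_req(), which may create the full child socket.
		 */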
1594 		struct request_sock *req = inet_reqsk(sk);
1595 		struct sock *nsk = NULL;
1596 
1597 		sk = req->rsk_listener;
1598 		if (tcp_v4_inbound_md5_hash(sk, skb))
1599 			goto discard_and_relse;
1600 		if (likely(sk->sk_state == TCP_LISTEN)) {
1601 			nsk = tcp_check_req(sk, skb, req, false);
1602 		} else {
1603 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1604 			goto lookup;
1605 		}
1606 		if (!nsk) {
1607 			reqsk_put(req);
1608 			goto discard_it;
1609 		}
1610 		if (nsk == sk) {
1611 			sock_hold(sk);
1612 			reqsk_put(req);
1613 		} else if (tcp_child_process(sk, nsk, skb)) {
1614 			tcp_v4_send_reset(nsk, skb);
1615 			goto discard_it;
1616 		} else {
1617 			return 0;
1618 		}
1619 	}
1620 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1621 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1622 		goto discard_and_relse;
1623 	}
1624 
1625 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1626 		goto discard_and_relse;
1627 
1628 	if (tcp_v4_inbound_md5_hash(sk, skb))
1629 		goto discard_and_relse;
1630 
1631 	nf_reset(skb);
1632 
1633 	if (sk_filter(sk, skb))
1634 		goto discard_and_relse;
1635 
1636 	skb->dev = NULL;
1637 
1638 	if (sk->sk_state == TCP_LISTEN) {
1639 		ret = tcp_v4_do_rcv(sk, skb);
1640 		goto put_and_return;
1641 	}
1642 
1643 	sk_incoming_cpu_update(sk);
1644 
1645 	bh_lock_sock_nested(sk);
1646 	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1647 	ret = 0;
1648 	if (!sock_owned_by_user(sk)) {
1649 		if (!tcp_prequeue(sk, skb))
1650 			ret = tcp_v4_do_rcv(sk, skb);
1651 	} else if (unlikely(sk_add_backlog(sk, skb,
1652 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1653 		bh_unlock_sock(sk);
1654 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1655 		goto discard_and_relse;
1656 	}
1657 	bh_unlock_sock(sk);
1658 
1659 put_and_return:
1660 	sock_put(sk);
1661 
1662 	return ret;
1663 
1664 no_tcp_socket:
1665 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1666 		goto discard_it;
1667 
1668 	if (tcp_checksum_complete(skb)) {
1669 csum_error:
1670 		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1671 bad_packet:
1672 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1673 	} else {
1674 		tcp_v4_send_reset(NULL, skb);
1675 	}
1676 
1677 discard_it:
1678 	/* Discard frame. */
1679 	kfree_skb(skb);
1680 	return 0;
1681 
1682 discard_and_relse:
1683 	sock_put(sk);
1684 	goto discard_it;
1685 
1686 do_time_wait:
1687 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1688 		inet_twsk_put(inet_twsk(sk));
1689 		goto discard_it;
1690 	}
1691 
1692 	if (tcp_checksum_complete(skb)) {
1693 		inet_twsk_put(inet_twsk(sk));
1694 		goto csum_error;
1695 	}
1696 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1697 	case TCP_TW_SYN: {
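		/* A new connection attempt hit a TIME-WAIT socket and
		 * tcp_timewait_state_process() judged the SYN acceptable; if a
		 * matching listener exists, retire the timewait socket and let
		 * the listener handle the SYN, otherwise fall through and just
		 * ACK it.
		 */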
1698 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1699 							&tcp_hashinfo,
1700 							iph->saddr, th->source,
1701 							iph->daddr, th->dest,
1702 							inet_iif(skb));
1703 		if (sk2) {
1704 			inet_twsk_deschedule_put(inet_twsk(sk));
1705 			sk = sk2;
1706 			goto process;
1707 		}
1708 		/* Fall through to ACK */
1709 	}
1710 	case TCP_TW_ACK:
1711 		tcp_v4_timewait_ack(sk, skb);
1712 		break;
1713 	case TCP_TW_RST:
1714 		tcp_v4_send_reset(sk, skb);
1715 		inet_twsk_deschedule_put(inet_twsk(sk));
1716 		goto discard_it;
1717 	case TCP_TW_SUCCESS:;
1718 	}
1719 	goto discard_it;
1720 }
1721 
1722 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1723 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1724 	.twsk_unique	= tcp_twsk_unique,
1725 	.twsk_destructor= tcp_twsk_destructor,
1726 };
1727 
1728 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1729 {
1730 	struct dst_entry *dst = skb_dst(skb);
1731 
1732 	if (dst && dst_hold_safe(dst)) {
1733 		sk->sk_rx_dst = dst;
1734 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1735 	}
1736 }
1737 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1738 
1739 const struct inet_connection_sock_af_ops ipv4_specific = {
1740 	.queue_xmit	   = ip_queue_xmit,
1741 	.send_check	   = tcp_v4_send_check,
1742 	.rebuild_header	   = inet_sk_rebuild_header,
1743 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1744 	.conn_request	   = tcp_v4_conn_request,
1745 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1746 	.net_header_len	   = sizeof(struct iphdr),
1747 	.setsockopt	   = ip_setsockopt,
1748 	.getsockopt	   = ip_getsockopt,
1749 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1750 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1751 	.bind_conflict	   = inet_csk_bind_conflict,
1752 #ifdef CONFIG_COMPAT
1753 	.compat_setsockopt = compat_ip_setsockopt,
1754 	.compat_getsockopt = compat_ip_getsockopt,
1755 #endif
1756 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1757 };
1758 EXPORT_SYMBOL(ipv4_specific);
1759 
1760 #ifdef CONFIG_TCP_MD5SIG
1761 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1762 	.md5_lookup		= tcp_v4_md5_lookup,
1763 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1764 	.md5_parse		= tcp_v4_parse_md5_keys,
1765 };
1766 #endif
1767 
1768 /* NOTE: A lot of things are set to zero explicitly by the call to
1769  *       sk_alloc(), so they need not be done here.
1770  */
1771 static int tcp_v4_init_sock(struct sock *sk)
1772 {
1773 	struct inet_connection_sock *icsk = inet_csk(sk);
1774 
1775 	tcp_init_sock(sk);
1776 
1777 	icsk->icsk_af_ops = &ipv4_specific;
1778 
1779 #ifdef CONFIG_TCP_MD5SIG
1780 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1781 #endif
1782 
1783 	return 0;
1784 }
1785 
1786 void tcp_v4_destroy_sock(struct sock *sk)
1787 {
1788 	struct tcp_sock *tp = tcp_sk(sk);
1789 
1790 	tcp_clear_xmit_timers(sk);
1791 
1792 	tcp_cleanup_congestion_control(sk);
1793 
1794 	/* Clean up the write buffer. */
1795 	tcp_write_queue_purge(sk);
1796 
1797 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1798 	__skb_queue_purge(&tp->out_of_order_queue);
1799 
1800 #ifdef CONFIG_TCP_MD5SIG
1801 	/* Clean up the MD5 key list, if any */
1802 	if (tp->md5sig_info) {
1803 		tcp_clear_md5_list(sk);
1804 		kfree_rcu(tp->md5sig_info, rcu);
1805 		tp->md5sig_info = NULL;
1806 	}
1807 #endif
1808 
1809 	/* Clean prequeue, it must be empty really */
1810 	__skb_queue_purge(&tp->ucopy.prequeue);
1811 
1812 	/* Clean up a referenced TCP bind bucket. */
1813 	if (inet_csk(sk)->icsk_bind_hash)
1814 		inet_put_port(sk);
1815 
1816 	BUG_ON(tp->fastopen_rsk);
1817 
1818 	/* If socket is aborted during connect operation */
1819 	tcp_free_fastopen_req(tp);
1820 	tcp_saved_syn_free(tp);
1821 
1822 	sk_sockets_allocated_dec(sk);
1823 
1824 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1825 		sock_release_memcg(sk);
1826 }
1827 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1828 
1829 #ifdef CONFIG_PROC_FS
1830 /* Proc filesystem TCP sock list dumping. */
1831 
1832 /*
1833  * Get the next listener socket following cur.  If cur is NULL, get the first socket
1834  * starting from bucket given in st->bucket; when st->bucket is zero the
1835  * very first socket in the hash table is returned.
1836  */
1837 static void *listening_get_next(struct seq_file *seq, void *cur)
1838 {
1839 	struct inet_connection_sock *icsk;
1840 	struct hlist_nulls_node *node;
1841 	struct sock *sk = cur;
1842 	struct inet_listen_hashbucket *ilb;
1843 	struct tcp_iter_state *st = seq->private;
1844 	struct net *net = seq_file_net(seq);
1845 
1846 	if (!sk) {
1847 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1848 		spin_lock_bh(&ilb->lock);
1849 		sk = sk_nulls_head(&ilb->head);
1850 		st->offset = 0;
1851 		goto get_sk;
1852 	}
1853 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1854 	++st->num;
1855 	++st->offset;
1856 
1857 	sk = sk_nulls_next(sk);
1858 get_sk:
1859 	sk_nulls_for_each_from(sk, node) {
1860 		if (!net_eq(sock_net(sk), net))
1861 			continue;
1862 		if (sk->sk_family == st->family) {
1863 			cur = sk;
1864 			goto out;
1865 		}
1867 	}
1868 	spin_unlock_bh(&ilb->lock);
1869 	st->offset = 0;
1870 	if (++st->bucket < INET_LHTABLE_SIZE) {
1871 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1872 		spin_lock_bh(&ilb->lock);
1873 		sk = sk_nulls_head(&ilb->head);
1874 		goto get_sk;
1875 	}
1876 	cur = NULL;
1877 out:
1878 	return cur;
1879 }
1880 
1881 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1882 {
1883 	struct tcp_iter_state *st = seq->private;
1884 	void *rc;
1885 
1886 	st->bucket = 0;
1887 	st->offset = 0;
1888 	rc = listening_get_next(seq, NULL);
1889 
1890 	while (rc && *pos) {
1891 		rc = listening_get_next(seq, rc);
1892 		--*pos;
1893 	}
1894 	return rc;
1895 }
1896 
1897 static inline bool empty_bucket(const struct tcp_iter_state *st)
1898 {
1899 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1900 }
1901 
1902 /*
1903  * Get first established socket starting from bucket given in st->bucket.
1904  * If st->bucket is zero, the very first socket in the hash is returned.
1905  */
1906 static void *established_get_first(struct seq_file *seq)
1907 {
1908 	struct tcp_iter_state *st = seq->private;
1909 	struct net *net = seq_file_net(seq);
1910 	void *rc = NULL;
1911 
1912 	st->offset = 0;
1913 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1914 		struct sock *sk;
1915 		struct hlist_nulls_node *node;
1916 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1917 
1918 		/* Lockless fast path for the common case of empty buckets */
1919 		if (empty_bucket(st))
1920 			continue;
1921 
1922 		spin_lock_bh(lock);
1923 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1924 			if (sk->sk_family != st->family ||
1925 			    !net_eq(sock_net(sk), net)) {
1926 				continue;
1927 			}
1928 			rc = sk;
1929 			goto out;
1930 		}
1931 		spin_unlock_bh(lock);
1932 	}
1933 out:
1934 	return rc;
1935 }
1936 
1937 static void *established_get_next(struct seq_file *seq, void *cur)
1938 {
1939 	struct sock *sk = cur;
1940 	struct hlist_nulls_node *node;
1941 	struct tcp_iter_state *st = seq->private;
1942 	struct net *net = seq_file_net(seq);
1943 
1944 	++st->num;
1945 	++st->offset;
1946 
1947 	sk = sk_nulls_next(sk);
1948 
1949 	sk_nulls_for_each_from(sk, node) {
1950 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1951 			return sk;
1952 	}
1953 
1954 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1955 	++st->bucket;
1956 	return established_get_first(seq);
1957 }
1958 
1959 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1960 {
1961 	struct tcp_iter_state *st = seq->private;
1962 	void *rc;
1963 
1964 	st->bucket = 0;
1965 	rc = established_get_first(seq);
1966 
1967 	while (rc && pos) {
1968 		rc = established_get_next(seq, rc);
1969 		--pos;
1970 	}
1971 	return rc;
1972 }
1973 
1974 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1975 {
1976 	void *rc;
1977 	struct tcp_iter_state *st = seq->private;
1978 
1979 	st->state = TCP_SEQ_STATE_LISTENING;
1980 	rc	  = listening_get_idx(seq, &pos);
1981 
1982 	if (!rc) {
1983 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1984 		rc	  = established_get_idx(seq, pos);
1985 	}
1986 
1987 	return rc;
1988 }
1989 
1990 static void *tcp_seek_last_pos(struct seq_file *seq)
1991 {
1992 	struct tcp_iter_state *st = seq->private;
1993 	int offset = st->offset;
1994 	int orig_num = st->num;
1995 	void *rc = NULL;
1996 
1997 	switch (st->state) {
1998 	case TCP_SEQ_STATE_LISTENING:
1999 		if (st->bucket >= INET_LHTABLE_SIZE)
2000 			break;
2001 		st->state = TCP_SEQ_STATE_LISTENING;
2002 		rc = listening_get_next(seq, NULL);
2003 		while (offset-- && rc)
2004 			rc = listening_get_next(seq, rc);
2005 		if (rc)
2006 			break;
2007 		st->bucket = 0;
2008 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2009 		/* Fallthrough */
2010 	case TCP_SEQ_STATE_ESTABLISHED:
2011 		if (st->bucket > tcp_hashinfo.ehash_mask)
2012 			break;
2013 		rc = established_get_first(seq);
2014 		while (offset-- && rc)
2015 			rc = established_get_next(seq, rc);
2016 	}
2017 
2018 	st->num = orig_num;
2019 
2020 	return rc;
2021 }
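
/*
 * Worked example (illustrative): st->bucket, st->offset and st->last_pos
 * make the /proc iteration resumable across reads.  If the previous read
 * of the listening table stopped on an entry in bucket 7 after advancing
 * st->offset entries within that bucket, a read that resumes at the same
 * *pos goes through tcp_seek_last_pos(), which re-walks only bucket 7 and
 * skips st->offset entries to land back on that entry, instead of
 * rescanning the whole table from bucket 0.
 */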
2022 
2023 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2024 {
2025 	struct tcp_iter_state *st = seq->private;
2026 	void *rc;
2027 
2028 	if (*pos && *pos == st->last_pos) {
2029 		rc = tcp_seek_last_pos(seq);
2030 		if (rc)
2031 			goto out;
2032 	}
2033 
2034 	st->state = TCP_SEQ_STATE_LISTENING;
2035 	st->num = 0;
2036 	st->bucket = 0;
2037 	st->offset = 0;
2038 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2039 
2040 out:
2041 	st->last_pos = *pos;
2042 	return rc;
2043 }
2044 
2045 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2046 {
2047 	struct tcp_iter_state *st = seq->private;
2048 	void *rc = NULL;
2049 
2050 	if (v == SEQ_START_TOKEN) {
2051 		rc = tcp_get_idx(seq, 0);
2052 		goto out;
2053 	}
2054 
2055 	switch (st->state) {
2056 	case TCP_SEQ_STATE_LISTENING:
2057 		rc = listening_get_next(seq, v);
2058 		if (!rc) {
2059 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2060 			st->bucket = 0;
2061 			st->offset = 0;
2062 			rc	  = established_get_first(seq);
2063 		}
2064 		break;
2065 	case TCP_SEQ_STATE_ESTABLISHED:
2066 		rc = established_get_next(seq, v);
2067 		break;
2068 	}
2069 out:
2070 	++*pos;
2071 	st->last_pos = *pos;
2072 	return rc;
2073 }
2074 
2075 static void tcp_seq_stop(struct seq_file *seq, void *v)
2076 {
2077 	struct tcp_iter_state *st = seq->private;
2078 
2079 	switch (st->state) {
2080 	case TCP_SEQ_STATE_LISTENING:
2081 		if (v != SEQ_START_TOKEN)
2082 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2083 		break;
2084 	case TCP_SEQ_STATE_ESTABLISHED:
2085 		if (v)
2086 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2087 		break;
2088 	}
2089 }
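
/*
 * Worked example of the lock hand-off (illustrative): the *_get_first()
 * and *_get_next() helpers return with the current bucket's lock still
 * held, so the entry stays stable while tcp4_seq_show() prints it.  For a
 * read that shows two sockets from ehash bucket 5 and one from bucket 6:
 * established_get_first() takes lock 5, the first two entries are shown,
 * advancing past bucket 5 drops lock 5 and takes lock 6, the third entry
 * is shown, and tcp_seq_stop() above finally drops lock 6; that is why it
 * switches on st->state to know which kind of lock to release.
 */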
2090 
2091 int tcp_seq_open(struct inode *inode, struct file *file)
2092 {
2093 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2094 	struct tcp_iter_state *s;
2095 	int err;
2096 
2097 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2098 			  sizeof(struct tcp_iter_state));
2099 	if (err < 0)
2100 		return err;
2101 
2102 	s = ((struct seq_file *)file->private_data)->private;
2103 	s->family		= afinfo->family;
2104 	s->last_pos		= 0;
2105 	return 0;
2106 }
2107 EXPORT_SYMBOL(tcp_seq_open);
2108 
2109 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2110 {
2111 	int rc = 0;
2112 	struct proc_dir_entry *p;
2113 
2114 	afinfo->seq_ops.start		= tcp_seq_start;
2115 	afinfo->seq_ops.next		= tcp_seq_next;
2116 	afinfo->seq_ops.stop		= tcp_seq_stop;
2117 
2118 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2119 			     afinfo->seq_fops, afinfo);
2120 	if (!p)
2121 		rc = -ENOMEM;
2122 	return rc;
2123 }
2124 EXPORT_SYMBOL(tcp_proc_register);
2125 
2126 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2127 {
2128 	remove_proc_entry(afinfo->name, net->proc_net);
2129 }
2130 EXPORT_SYMBOL(tcp_proc_unregister);
2131 
2132 static void get_openreq4(const struct request_sock *req,
2133 			 struct seq_file *f, int i)
2134 {
2135 	const struct inet_request_sock *ireq = inet_rsk(req);
2136 	long delta = req->rsk_timer.expires - jiffies;
2137 
2138 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2139 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2140 		i,
2141 		ireq->ir_loc_addr,
2142 		ireq->ir_num,
2143 		ireq->ir_rmt_addr,
2144 		ntohs(ireq->ir_rmt_port),
2145 		TCP_SYN_RECV,
2146 		0, 0, /* could print option size, but that is af dependent. */
2147 		1,    /* timers active (only the expire timer) */
2148 		jiffies_delta_to_clock_t(delta),
2149 		req->num_timeout,
2150 		from_kuid_munged(seq_user_ns(f),
2151 				 sock_i_uid(req->rsk_listener)),
2152 		0,  /* non-standard timer */
2153 		0, /* open_requests have no inode */
2154 		0,
2155 		req);
2156 }
2157 
2158 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2159 {
2160 	int timer_active;
2161 	unsigned long timer_expires;
2162 	const struct tcp_sock *tp = tcp_sk(sk);
2163 	const struct inet_connection_sock *icsk = inet_csk(sk);
2164 	const struct inet_sock *inet = inet_sk(sk);
2165 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2166 	__be32 dest = inet->inet_daddr;
2167 	__be32 src = inet->inet_rcv_saddr;
2168 	__u16 destp = ntohs(inet->inet_dport);
2169 	__u16 srcp = ntohs(inet->inet_sport);
2170 	int rx_queue;
2171 	int state;
2172 
2173 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2174 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2175 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2176 		timer_active	= 1;
2177 		timer_expires	= icsk->icsk_timeout;
2178 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2179 		timer_active	= 4;
2180 		timer_expires	= icsk->icsk_timeout;
2181 	} else if (timer_pending(&sk->sk_timer)) {
2182 		timer_active	= 2;
2183 		timer_expires	= sk->sk_timer.expires;
2184 	} else {
2185 		timer_active	= 0;
2186 		timer_expires = jiffies;
2187 	}
2188 
2189 	state = sk_state_load(sk);
2190 	if (state == TCP_LISTEN)
2191 		rx_queue = sk->sk_ack_backlog;
2192 	else
2193 		/* Because we don't lock the socket,
2194 		 * we might find a transient negative value.
2195 		 */
2196 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2197 
2198 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2199 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2200 		i, src, srcp, dest, destp, state,
2201 		tp->write_seq - tp->snd_una,
2202 		rx_queue,
2203 		timer_active,
2204 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2205 		icsk->icsk_retransmits,
2206 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2207 		icsk->icsk_probes_out,
2208 		sock_i_ino(sk),
2209 		atomic_read(&sk->sk_refcnt), sk,
2210 		jiffies_to_clock_t(icsk->icsk_rto),
2211 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2212 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2213 		tp->snd_cwnd,
2214 		state == TCP_LISTEN ?
2215 		    fastopenq->max_qlen :
2216 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2217 }
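
/*
 * Worked example (illustrative, the values are made up): a line emitted
 * by get_tcp4_sock() for a local web server listening on 127.0.0.1:80
 * would begin roughly
 *
 *	0: 0100007F:0050 00000000:0000 0A ...
 *
 * The address words are the raw __be32 values printed with %08X, so on a
 * little-endian host "0100007F" is 127.0.0.1; the ports were already
 * converted with ntohs() and are printed with %04X, so "0050" is port 80;
 * and the state column "0A" is TCP_LISTEN (10).
 */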
2218 
2219 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2220 			       struct seq_file *f, int i)
2221 {
2222 	long delta = tw->tw_timer.expires - jiffies;
2223 	__be32 dest, src;
2224 	__u16 destp, srcp;
2225 
2226 	dest  = tw->tw_daddr;
2227 	src   = tw->tw_rcv_saddr;
2228 	destp = ntohs(tw->tw_dport);
2229 	srcp  = ntohs(tw->tw_sport);
2230 
2231 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2232 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2233 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2234 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2235 		atomic_read(&tw->tw_refcnt), tw);
2236 }
2237 
2238 #define TMPSZ 150
2239 
2240 static int tcp4_seq_show(struct seq_file *seq, void *v)
2241 {
2242 	struct tcp_iter_state *st;
2243 	struct sock *sk = v;
2244 
2245 	seq_setwidth(seq, TMPSZ - 1);
2246 	if (v == SEQ_START_TOKEN) {
2247 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2248 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2249 			   "inode");
2250 		goto out;
2251 	}
2252 	st = seq->private;
2253 
2254 	if (sk->sk_state == TCP_TIME_WAIT)
2255 		get_timewait4_sock(v, seq, st->num);
2256 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2257 		get_openreq4(v, seq, st->num);
2258 	else
2259 		get_tcp4_sock(v, seq, st->num);
2260 out:
2261 	seq_pad(seq, '\n');
2262 	return 0;
2263 }
2264 
2265 static const struct file_operations tcp_afinfo_seq_fops = {
2266 	.owner   = THIS_MODULE,
2267 	.open    = tcp_seq_open,
2268 	.read    = seq_read,
2269 	.llseek  = seq_lseek,
2270 	.release = seq_release_net
2271 };
2272 
2273 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2274 	.name		= "tcp",
2275 	.family		= AF_INET,
2276 	.seq_fops	= &tcp_afinfo_seq_fops,
2277 	.seq_ops	= {
2278 		.show		= tcp4_seq_show,
2279 	},
2280 };
2281 
2282 static int __net_init tcp4_proc_init_net(struct net *net)
2283 {
2284 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2285 }
2286 
2287 static void __net_exit tcp4_proc_exit_net(struct net *net)
2288 {
2289 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2290 }
2291 
2292 static struct pernet_operations tcp4_net_ops = {
2293 	.init = tcp4_proc_init_net,
2294 	.exit = tcp4_proc_exit_net,
2295 };
2296 
2297 int __init tcp4_proc_init(void)
2298 {
2299 	return register_pernet_subsys(&tcp4_net_ops);
2300 }
2301 
2302 void tcp4_proc_exit(void)
2303 {
2304 	unregister_pernet_subsys(&tcp4_net_ops);
2305 }
2306 #endif /* CONFIG_PROC_FS */
2307 
2308 struct proto tcp_prot = {
2309 	.name			= "TCP",
2310 	.owner			= THIS_MODULE,
2311 	.close			= tcp_close,
2312 	.connect		= tcp_v4_connect,
2313 	.disconnect		= tcp_disconnect,
2314 	.accept			= inet_csk_accept,
2315 	.ioctl			= tcp_ioctl,
2316 	.init			= tcp_v4_init_sock,
2317 	.destroy		= tcp_v4_destroy_sock,
2318 	.shutdown		= tcp_shutdown,
2319 	.setsockopt		= tcp_setsockopt,
2320 	.getsockopt		= tcp_getsockopt,
2321 	.recvmsg		= tcp_recvmsg,
2322 	.sendmsg		= tcp_sendmsg,
2323 	.sendpage		= tcp_sendpage,
2324 	.backlog_rcv		= tcp_v4_do_rcv,
2325 	.release_cb		= tcp_release_cb,
2326 	.hash			= inet_hash,
2327 	.unhash			= inet_unhash,
2328 	.get_port		= inet_csk_get_port,
2329 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2330 	.stream_memory_free	= tcp_stream_memory_free,
2331 	.sockets_allocated	= &tcp_sockets_allocated,
2332 	.orphan_count		= &tcp_orphan_count,
2333 	.memory_allocated	= &tcp_memory_allocated,
2334 	.memory_pressure	= &tcp_memory_pressure,
2335 	.sysctl_mem		= sysctl_tcp_mem,
2336 	.sysctl_wmem		= sysctl_tcp_wmem,
2337 	.sysctl_rmem		= sysctl_tcp_rmem,
2338 	.max_header		= MAX_TCP_HEADER,
2339 	.obj_size		= sizeof(struct tcp_sock),
2340 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2341 	.twsk_prot		= &tcp_timewait_sock_ops,
2342 	.rsk_prot		= &tcp_request_sock_ops,
2343 	.h.hashinfo		= &tcp_hashinfo,
2344 	.no_autobind		= true,
2345 #ifdef CONFIG_COMPAT
2346 	.compat_setsockopt	= compat_tcp_setsockopt,
2347 	.compat_getsockopt	= compat_tcp_getsockopt,
2348 #endif
2349 	.diag_destroy		= tcp_abort,
2350 };
2351 EXPORT_SYMBOL(tcp_prot);
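
/*
 * Illustrative sketch: nothing in this file instantiates tcp_prot itself;
 * the AF_INET glue registers it at boot, roughly as below (simplified
 * from net/ipv4/af_inet.c, shown here only for orientation):
 *
 *	proto_register(&tcp_prot, 1);
 *
 *	static struct inet_protosw inetsw_array[] = {
 *		{
 *			.type     = SOCK_STREAM,
 *			.protocol = IPPROTO_TCP,
 *			.prot     = &tcp_prot,
 *			.ops      = &inet_stream_ops,
 *			.flags    = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
 *		},
 *		...
 *	};
 */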
2352 
2353 static void __net_exit tcp_sk_exit(struct net *net)
2354 {
2355 	int cpu;
2356 
2357 	for_each_possible_cpu(cpu)
2358 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2359 	free_percpu(net->ipv4.tcp_sk);
2360 }
2361 
2362 static int __net_init tcp_sk_init(struct net *net)
2363 {
2364 	int res, cpu;
2365 
2366 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2367 	if (!net->ipv4.tcp_sk)
2368 		return -ENOMEM;
2369 
2370 	for_each_possible_cpu(cpu) {
2371 		struct sock *sk;
2372 
2373 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2374 					   IPPROTO_TCP, net);
2375 		if (res)
2376 			goto fail;
2377 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2378 	}
2379 
2380 	net->ipv4.sysctl_tcp_ecn = 2;
2381 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2382 
2383 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2384 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2385 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2386 
2387 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2388 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2389 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2390 
2391 	return 0;
2392 fail:
2393 	tcp_sk_exit(net);
2394 
2395 	return res;
2396 }
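
/*
 * Illustrative usage: the per-cpu control sockets allocated in
 * tcp_sk_init() are what the stateless reply paths earlier in this file
 * (tcp_v4_send_reset() and tcp_v4_send_ack()) transmit through, roughly:
 *
 *	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, ...);
 *
 * Keeping one socket per possible CPU means that path needs no locking
 * on the control socket.
 */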
2397 
2398 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2399 {
2400 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2401 }
2402 
2403 static struct pernet_operations __net_initdata tcp_sk_ops = {
2404 	.init	   = tcp_sk_init,
2405 	.exit	   = tcp_sk_exit,
2406 	.exit_batch = tcp_sk_exit_batch,
2407 };
2408 
2409 void __init tcp_v4_init(void)
2410 {
2411 	inet_hashinfo_init(&tcp_hashinfo);
2412 	if (register_pernet_subsys(&tcp_sk_ops))
2413 		panic("Failed to create the TCP control socket.\n");
2414 }
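
/*
 * Illustrative note: tcp_v4_init() runs once at boot from inet_init() in
 * net/ipv4/af_inet.c, which also registers tcp_prot and calls the generic
 * tcp_init().  Failing to register the per-net control sockets here is
 * treated as fatal, hence the panic() above rather than an error return.
 */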
2415