xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision a06c488d)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <linux/crypto.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90 
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95 
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98 
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102 					  ip_hdr(skb)->saddr,
103 					  tcp_hdr(skb)->dest,
104 					  tcp_hdr(skb)->source);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 
112 	/* With PAWS, it is safe from the viewpoint
113 	   of data integrity. Even without PAWS it is safe provided sequence
114 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
115 
116 	   Actually, the idea is close to VJ's one, only timestamp cache is
117 	   held not per host, but per port pair and TW bucket is used as state
118 	   holder.
119 
120 	   If TW bucket has been already destroyed we fall back to VJ's scheme
121 	   and use initial timestamp retrieved from peer table.
122 	 */
123 	if (tcptw->tw_ts_recent_stamp &&
124 	    (!twp || (sysctl_tcp_tw_reuse &&
125 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
126 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127 		if (tp->write_seq == 0)
128 			tp->write_seq = 1;
129 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
130 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131 		sock_hold(sktw);
132 		return 1;
133 	}
134 
135 	return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
138 
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143 	struct inet_sock *inet = inet_sk(sk);
144 	struct tcp_sock *tp = tcp_sk(sk);
145 	__be16 orig_sport, orig_dport;
146 	__be32 daddr, nexthop;
147 	struct flowi4 *fl4;
148 	struct rtable *rt;
149 	int err;
150 	struct ip_options_rcu *inet_opt;
151 
152 	if (addr_len < sizeof(struct sockaddr_in))
153 		return -EINVAL;
154 
155 	if (usin->sin_family != AF_INET)
156 		return -EAFNOSUPPORT;
157 
158 	nexthop = daddr = usin->sin_addr.s_addr;
159 	inet_opt = rcu_dereference_protected(inet->inet_opt,
160 					     sock_owned_by_user(sk));
161 	if (inet_opt && inet_opt->opt.srr) {
162 		if (!daddr)
163 			return -EINVAL;
164 		nexthop = inet_opt->opt.faddr;
165 	}
166 
167 	orig_sport = inet->inet_sport;
168 	orig_dport = usin->sin_port;
169 	fl4 = &inet->cork.fl.u.ip4;
170 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172 			      IPPROTO_TCP,
173 			      orig_sport, orig_dport, sk);
174 	if (IS_ERR(rt)) {
175 		err = PTR_ERR(rt);
176 		if (err == -ENETUNREACH)
177 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178 		return err;
179 	}
180 
181 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182 		ip_rt_put(rt);
183 		return -ENETUNREACH;
184 	}
185 
186 	if (!inet_opt || !inet_opt->opt.srr)
187 		daddr = fl4->daddr;
188 
189 	if (!inet->inet_saddr)
190 		inet->inet_saddr = fl4->saddr;
191 	sk_rcv_saddr_set(sk, inet->inet_saddr);
192 
193 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194 		/* Reset inherited state */
195 		tp->rx_opt.ts_recent	   = 0;
196 		tp->rx_opt.ts_recent_stamp = 0;
197 		if (likely(!tp->repair))
198 			tp->write_seq	   = 0;
199 	}
200 
201 	if (tcp_death_row.sysctl_tw_recycle &&
202 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203 		tcp_fetch_timewait_stamp(sk, &rt->dst);
204 
205 	inet->inet_dport = usin->sin_port;
206 	sk_daddr_set(sk, daddr);
207 
208 	inet_csk(sk)->icsk_ext_hdr_len = 0;
209 	if (inet_opt)
210 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211 
212 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213 
214 	/* Socket identity is still unknown (sport may be zero).
215 	 * However we set state to SYN-SENT and not releasing socket
216 	 * lock select source port, enter ourselves into the hash tables and
217 	 * complete initialization after this.
218 	 */
219 	tcp_set_state(sk, TCP_SYN_SENT);
220 	err = inet_hash_connect(&tcp_death_row, sk);
221 	if (err)
222 		goto failure;
223 
224 	sk_set_txhash(sk);
225 
226 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 			       inet->inet_sport, inet->inet_dport, sk);
228 	if (IS_ERR(rt)) {
229 		err = PTR_ERR(rt);
230 		rt = NULL;
231 		goto failure;
232 	}
233 	/* OK, now commit destination to socket.  */
234 	sk->sk_gso_type = SKB_GSO_TCPV4;
235 	sk_setup_caps(sk, &rt->dst);
236 
237 	if (!tp->write_seq && likely(!tp->repair))
238 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 							   inet->inet_daddr,
240 							   inet->inet_sport,
241 							   usin->sin_port);
242 
243 	inet->inet_id = tp->write_seq ^ jiffies;
244 
245 	err = tcp_connect(sk);
246 
247 	rt = NULL;
248 	if (err)
249 		goto failure;
250 
251 	return 0;
252 
253 failure:
254 	/*
255 	 * This unhashes the socket and releases the local port,
256 	 * if necessary.
257 	 */
258 	tcp_set_state(sk, TCP_CLOSE);
259 	ip_rt_put(rt);
260 	sk->sk_route_caps = 0;
261 	inet->inet_dport = 0;
262 	return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265 
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 	struct dst_entry *dst;
274 	struct inet_sock *inet = inet_sk(sk);
275 	u32 mtu = tcp_sk(sk)->mtu_info;
276 
277 	dst = inet_csk_update_pmtu(sk, mtu);
278 	if (!dst)
279 		return;
280 
281 	/* Something is about to be wrong... Remember soft error
282 	 * for the case, if this connection will not able to recover.
283 	 */
284 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 		sk->sk_err_soft = EMSGSIZE;
286 
287 	mtu = dst_mtu(dst);
288 
289 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290 	    ip_sk_accept_pmtu(sk) &&
291 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292 		tcp_sync_mss(sk, mtu);
293 
294 		/* Resend the TCP packet because it's
295 		 * clear that the old packet has been
296 		 * dropped. This is the new "fast" path mtu
297 		 * discovery.
298 		 */
299 		tcp_simple_retransmit(sk);
300 	} /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303 
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306 	struct dst_entry *dst = __sk_dst_check(sk, 0);
307 
308 	if (dst)
309 		dst->ops->redirect(dst, sk, skb);
310 }
311 
312 
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq)
315 {
316 	struct request_sock *req = inet_reqsk(sk);
317 	struct net *net = sock_net(sk);
318 
319 	/* ICMPs are not backlogged, hence we cannot get
320 	 * an established socket here.
321 	 */
322 	WARN_ON(req->sk);
323 
324 	if (seq != tcp_rsk(req)->snt_isn) {
325 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
326 	} else {
327 		/*
328 		 * Still in SYN_RECV, just remove it silently.
329 		 * There is no good way to pass the error to the newly
330 		 * created socket, and POSIX does not want network
331 		 * errors returned from accept().
332 		 */
333 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
334 		NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
335 	}
336 	reqsk_put(req);
337 }
338 EXPORT_SYMBOL(tcp_req_err);
339 
340 /*
341  * This routine is called by the ICMP module when it gets some
342  * sort of error condition.  If err < 0 then the socket should
343  * be closed and the error returned to the user.  If err > 0
344  * it's just the icmp type << 8 | icmp code.  After adjustment
345  * header points to the first 8 bytes of the tcp header.  We need
346  * to find the appropriate port.
347  *
348  * The locking strategy used here is very "optimistic". When
349  * someone else accesses the socket the ICMP is just dropped
350  * and for some paths there is no check at all.
351  * A more general error queue to queue errors for later handling
352  * is probably better.
353  *
354  */
355 
356 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
357 {
358 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
359 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
360 	struct inet_connection_sock *icsk;
361 	struct tcp_sock *tp;
362 	struct inet_sock *inet;
363 	const int type = icmp_hdr(icmp_skb)->type;
364 	const int code = icmp_hdr(icmp_skb)->code;
365 	struct sock *sk;
366 	struct sk_buff *skb;
367 	struct request_sock *fastopen;
368 	__u32 seq, snd_una;
369 	__u32 remaining;
370 	int err;
371 	struct net *net = dev_net(icmp_skb->dev);
372 
373 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
374 				       th->dest, iph->saddr, ntohs(th->source),
375 				       inet_iif(icmp_skb));
376 	if (!sk) {
377 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
378 		return;
379 	}
380 	if (sk->sk_state == TCP_TIME_WAIT) {
381 		inet_twsk_put(inet_twsk(sk));
382 		return;
383 	}
384 	seq = ntohl(th->seq);
385 	if (sk->sk_state == TCP_NEW_SYN_RECV)
386 		return tcp_req_err(sk, seq);
387 
388 	bh_lock_sock(sk);
389 	/* If too many ICMPs get dropped on busy
390 	 * servers this needs to be solved differently.
391 	 * We do take care of PMTU discovery (RFC1191) special case :
392 	 * we can receive locally generated ICMP messages while socket is held.
393 	 */
394 	if (sock_owned_by_user(sk)) {
395 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
396 			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
397 	}
398 	if (sk->sk_state == TCP_CLOSE)
399 		goto out;
400 
401 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
402 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
403 		goto out;
404 	}
405 
406 	icsk = inet_csk(sk);
407 	tp = tcp_sk(sk);
408 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
409 	fastopen = tp->fastopen_rsk;
410 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
411 	if (sk->sk_state != TCP_LISTEN &&
412 	    !between(seq, snd_una, tp->snd_nxt)) {
413 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
414 		goto out;
415 	}
416 
417 	switch (type) {
418 	case ICMP_REDIRECT:
419 		do_redirect(icmp_skb, sk);
420 		goto out;
421 	case ICMP_SOURCE_QUENCH:
422 		/* Just silently ignore these. */
423 		goto out;
424 	case ICMP_PARAMETERPROB:
425 		err = EPROTO;
426 		break;
427 	case ICMP_DEST_UNREACH:
428 		if (code > NR_ICMP_UNREACH)
429 			goto out;
430 
431 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
432 			/* We are not interested in TCP_LISTEN and open_requests
433 			 * (SYN-ACKs send out by Linux are always <576bytes so
434 			 * they should go through unfragmented).
435 			 */
436 			if (sk->sk_state == TCP_LISTEN)
437 				goto out;
438 
439 			tp->mtu_info = info;
440 			if (!sock_owned_by_user(sk)) {
441 				tcp_v4_mtu_reduced(sk);
442 			} else {
443 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
444 					sock_hold(sk);
445 			}
446 			goto out;
447 		}
448 
449 		err = icmp_err_convert[code].errno;
450 		/* check if icmp_skb allows revert of backoff
451 		 * (see draft-zimmermann-tcp-lcd) */
452 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
453 			break;
454 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
455 		    !icsk->icsk_backoff || fastopen)
456 			break;
457 
458 		if (sock_owned_by_user(sk))
459 			break;
460 
461 		icsk->icsk_backoff--;
462 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
463 					       TCP_TIMEOUT_INIT;
464 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
465 
466 		skb = tcp_write_queue_head(sk);
467 		BUG_ON(!skb);
468 
469 		remaining = icsk->icsk_rto -
470 			    min(icsk->icsk_rto,
471 				tcp_time_stamp - tcp_skb_timestamp(skb));
472 
473 		if (remaining) {
474 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
475 						  remaining, TCP_RTO_MAX);
476 		} else {
477 			/* RTO revert clocked out retransmission.
478 			 * Will retransmit now */
479 			tcp_retransmit_timer(sk);
480 		}
481 
482 		break;
483 	case ICMP_TIME_EXCEEDED:
484 		err = EHOSTUNREACH;
485 		break;
486 	default:
487 		goto out;
488 	}
489 
490 	switch (sk->sk_state) {
491 	case TCP_SYN_SENT:
492 	case TCP_SYN_RECV:
493 		/* Only in fast or simultaneous open. If a fast open socket is
494 		 * is already accepted it is treated as a connected one below.
495 		 */
496 		if (fastopen && !fastopen->sk)
497 			break;
498 
499 		if (!sock_owned_by_user(sk)) {
500 			sk->sk_err = err;
501 
502 			sk->sk_error_report(sk);
503 
504 			tcp_done(sk);
505 		} else {
506 			sk->sk_err_soft = err;
507 		}
508 		goto out;
509 	}
510 
511 	/* If we've already connected we will keep trying
512 	 * until we time out, or the user gives up.
513 	 *
514 	 * rfc1122 4.2.3.9 allows to consider as hard errors
515 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
516 	 * but it is obsoleted by pmtu discovery).
517 	 *
518 	 * Note, that in modern internet, where routing is unreliable
519 	 * and in each dark corner broken firewalls sit, sending random
520 	 * errors ordered by their masters even this two messages finally lose
521 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
522 	 *
523 	 * Now we are in compliance with RFCs.
524 	 *							--ANK (980905)
525 	 */
526 
527 	inet = inet_sk(sk);
528 	if (!sock_owned_by_user(sk) && inet->recverr) {
529 		sk->sk_err = err;
530 		sk->sk_error_report(sk);
531 	} else	{ /* Only an error on timeout */
532 		sk->sk_err_soft = err;
533 	}
534 
535 out:
536 	bh_unlock_sock(sk);
537 	sock_put(sk);
538 }
539 
540 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
541 {
542 	struct tcphdr *th = tcp_hdr(skb);
543 
544 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
545 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
546 		skb->csum_start = skb_transport_header(skb) - skb->head;
547 		skb->csum_offset = offsetof(struct tcphdr, check);
548 	} else {
549 		th->check = tcp_v4_check(skb->len, saddr, daddr,
550 					 csum_partial(th,
551 						      th->doff << 2,
552 						      skb->csum));
553 	}
554 }
555 
556 /* This routine computes an IPv4 TCP checksum. */
557 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
558 {
559 	const struct inet_sock *inet = inet_sk(sk);
560 
561 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
562 }
563 EXPORT_SYMBOL(tcp_v4_send_check);
564 
565 /*
566  *	This routine will send an RST to the other tcp.
567  *
568  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
569  *		      for reset.
570  *	Answer: if a packet caused RST, it is not for a socket
571  *		existing in our system, if it is matched to a socket,
572  *		it is just duplicate segment or bug in other side's TCP.
573  *		So that we build reply only basing on parameters
574  *		arrived with segment.
575  *	Exception: precedence violation. We do not implement it in any case.
576  */
577 
578 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
579 {
580 	const struct tcphdr *th = tcp_hdr(skb);
581 	struct {
582 		struct tcphdr th;
583 #ifdef CONFIG_TCP_MD5SIG
584 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
585 #endif
586 	} rep;
587 	struct ip_reply_arg arg;
588 #ifdef CONFIG_TCP_MD5SIG
589 	struct tcp_md5sig_key *key = NULL;
590 	const __u8 *hash_location = NULL;
591 	unsigned char newhash[16];
592 	int genhash;
593 	struct sock *sk1 = NULL;
594 #endif
595 	struct net *net;
596 
597 	/* Never send a reset in response to a reset. */
598 	if (th->rst)
599 		return;
600 
601 	/* If sk not NULL, it means we did a successful lookup and incoming
602 	 * route had to be correct. prequeue might have dropped our dst.
603 	 */
604 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
605 		return;
606 
607 	/* Swap the send and the receive. */
608 	memset(&rep, 0, sizeof(rep));
609 	rep.th.dest   = th->source;
610 	rep.th.source = th->dest;
611 	rep.th.doff   = sizeof(struct tcphdr) / 4;
612 	rep.th.rst    = 1;
613 
614 	if (th->ack) {
615 		rep.th.seq = th->ack_seq;
616 	} else {
617 		rep.th.ack = 1;
618 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
619 				       skb->len - (th->doff << 2));
620 	}
621 
622 	memset(&arg, 0, sizeof(arg));
623 	arg.iov[0].iov_base = (unsigned char *)&rep;
624 	arg.iov[0].iov_len  = sizeof(rep.th);
625 
626 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
627 #ifdef CONFIG_TCP_MD5SIG
628 	hash_location = tcp_parse_md5sig_option(th);
629 	if (sk && sk_fullsock(sk)) {
630 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
631 					&ip_hdr(skb)->saddr, AF_INET);
632 	} else if (hash_location) {
633 		/*
634 		 * active side is lost. Try to find listening socket through
635 		 * source port, and then find md5 key through listening socket.
636 		 * we are not loose security here:
637 		 * Incoming packet is checked with md5 hash with finding key,
638 		 * no RST generated if md5 hash doesn't match.
639 		 */
640 		sk1 = __inet_lookup_listener(net,
641 					     &tcp_hashinfo, ip_hdr(skb)->saddr,
642 					     th->source, ip_hdr(skb)->daddr,
643 					     ntohs(th->source), inet_iif(skb));
644 		/* don't send rst if it can't find key */
645 		if (!sk1)
646 			return;
647 		rcu_read_lock();
648 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
649 					&ip_hdr(skb)->saddr, AF_INET);
650 		if (!key)
651 			goto release_sk1;
652 
653 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
654 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
655 			goto release_sk1;
656 	}
657 
658 	if (key) {
659 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
660 				   (TCPOPT_NOP << 16) |
661 				   (TCPOPT_MD5SIG << 8) |
662 				   TCPOLEN_MD5SIG);
663 		/* Update length and the length the header thinks exists */
664 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
665 		rep.th.doff = arg.iov[0].iov_len / 4;
666 
667 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
668 				     key, ip_hdr(skb)->saddr,
669 				     ip_hdr(skb)->daddr, &rep.th);
670 	}
671 #endif
672 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
673 				      ip_hdr(skb)->saddr, /* XXX */
674 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
675 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
676 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
677 
678 	/* When socket is gone, all binding information is lost.
679 	 * routing might fail in this case. No choice here, if we choose to force
680 	 * input interface, we will misroute in case of asymmetric route.
681 	 */
682 	if (sk)
683 		arg.bound_dev_if = sk->sk_bound_dev_if;
684 
685 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
686 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
687 
688 	arg.tos = ip_hdr(skb)->tos;
689 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
690 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
691 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
692 			      &arg, arg.iov[0].iov_len);
693 
694 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
695 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
696 
697 #ifdef CONFIG_TCP_MD5SIG
698 release_sk1:
699 	if (sk1) {
700 		rcu_read_unlock();
701 		sock_put(sk1);
702 	}
703 #endif
704 }
705 
706 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
707    outside socket context is ugly, certainly. What can I do?
708  */
709 
710 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
711 			    u32 win, u32 tsval, u32 tsecr, int oif,
712 			    struct tcp_md5sig_key *key,
713 			    int reply_flags, u8 tos)
714 {
715 	const struct tcphdr *th = tcp_hdr(skb);
716 	struct {
717 		struct tcphdr th;
718 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
719 #ifdef CONFIG_TCP_MD5SIG
720 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
721 #endif
722 			];
723 	} rep;
724 	struct ip_reply_arg arg;
725 	struct net *net = dev_net(skb_dst(skb)->dev);
726 
727 	memset(&rep.th, 0, sizeof(struct tcphdr));
728 	memset(&arg, 0, sizeof(arg));
729 
730 	arg.iov[0].iov_base = (unsigned char *)&rep;
731 	arg.iov[0].iov_len  = sizeof(rep.th);
732 	if (tsecr) {
733 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
734 				   (TCPOPT_TIMESTAMP << 8) |
735 				   TCPOLEN_TIMESTAMP);
736 		rep.opt[1] = htonl(tsval);
737 		rep.opt[2] = htonl(tsecr);
738 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
739 	}
740 
741 	/* Swap the send and the receive. */
742 	rep.th.dest    = th->source;
743 	rep.th.source  = th->dest;
744 	rep.th.doff    = arg.iov[0].iov_len / 4;
745 	rep.th.seq     = htonl(seq);
746 	rep.th.ack_seq = htonl(ack);
747 	rep.th.ack     = 1;
748 	rep.th.window  = htons(win);
749 
750 #ifdef CONFIG_TCP_MD5SIG
751 	if (key) {
752 		int offset = (tsecr) ? 3 : 0;
753 
754 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
755 					  (TCPOPT_NOP << 16) |
756 					  (TCPOPT_MD5SIG << 8) |
757 					  TCPOLEN_MD5SIG);
758 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
759 		rep.th.doff = arg.iov[0].iov_len/4;
760 
761 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
762 				    key, ip_hdr(skb)->saddr,
763 				    ip_hdr(skb)->daddr, &rep.th);
764 	}
765 #endif
766 	arg.flags = reply_flags;
767 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
768 				      ip_hdr(skb)->saddr, /* XXX */
769 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
770 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
771 	if (oif)
772 		arg.bound_dev_if = oif;
773 	arg.tos = tos;
774 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
775 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
776 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
777 			      &arg, arg.iov[0].iov_len);
778 
779 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
780 }
781 
782 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
783 {
784 	struct inet_timewait_sock *tw = inet_twsk(sk);
785 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
786 
787 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
788 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
789 			tcp_time_stamp + tcptw->tw_ts_offset,
790 			tcptw->tw_ts_recent,
791 			tw->tw_bound_dev_if,
792 			tcp_twsk_md5_key(tcptw),
793 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
794 			tw->tw_tos
795 			);
796 
797 	inet_twsk_put(tw);
798 }
799 
800 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
801 				  struct request_sock *req)
802 {
803 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
804 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
805 	 */
806 	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
807 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
808 			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
809 			tcp_time_stamp,
810 			req->ts_recent,
811 			0,
812 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
813 					  AF_INET),
814 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
815 			ip_hdr(skb)->tos);
816 }
817 
818 /*
819  *	Send a SYN-ACK after having received a SYN.
820  *	This still operates on a request_sock only, not on a big
821  *	socket.
822  */
823 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
824 			      struct flowi *fl,
825 			      struct request_sock *req,
826 			      struct tcp_fastopen_cookie *foc,
827 				  bool attach_req)
828 {
829 	const struct inet_request_sock *ireq = inet_rsk(req);
830 	struct flowi4 fl4;
831 	int err = -1;
832 	struct sk_buff *skb;
833 
834 	/* First, grab a route. */
835 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
836 		return -1;
837 
838 	skb = tcp_make_synack(sk, dst, req, foc, attach_req);
839 
840 	if (skb) {
841 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
842 
843 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
844 					    ireq->ir_rmt_addr,
845 					    ireq->opt);
846 		err = net_xmit_eval(err);
847 	}
848 
849 	return err;
850 }
851 
852 /*
853  *	IPv4 request_sock destructor.
854  */
855 static void tcp_v4_reqsk_destructor(struct request_sock *req)
856 {
857 	kfree(inet_rsk(req)->opt);
858 }
859 
860 
861 #ifdef CONFIG_TCP_MD5SIG
862 /*
863  * RFC2385 MD5 checksumming requires a mapping of
864  * IP address->MD5 Key.
865  * We need to maintain these in the sk structure.
866  */
867 
868 /* Find the Key structure for an address.  */
869 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
870 					 const union tcp_md5_addr *addr,
871 					 int family)
872 {
873 	const struct tcp_sock *tp = tcp_sk(sk);
874 	struct tcp_md5sig_key *key;
875 	unsigned int size = sizeof(struct in_addr);
876 	const struct tcp_md5sig_info *md5sig;
877 
878 	/* caller either holds rcu_read_lock() or socket lock */
879 	md5sig = rcu_dereference_check(tp->md5sig_info,
880 				       sock_owned_by_user(sk) ||
881 				       lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
882 	if (!md5sig)
883 		return NULL;
884 #if IS_ENABLED(CONFIG_IPV6)
885 	if (family == AF_INET6)
886 		size = sizeof(struct in6_addr);
887 #endif
888 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
889 		if (key->family != family)
890 			continue;
891 		if (!memcmp(&key->addr, addr, size))
892 			return key;
893 	}
894 	return NULL;
895 }
896 EXPORT_SYMBOL(tcp_md5_do_lookup);
897 
898 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
899 					 const struct sock *addr_sk)
900 {
901 	const union tcp_md5_addr *addr;
902 
903 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
904 	return tcp_md5_do_lookup(sk, addr, AF_INET);
905 }
906 EXPORT_SYMBOL(tcp_v4_md5_lookup);
907 
908 /* This can be called on a newly created socket, from other files */
909 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
910 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
911 {
912 	/* Add Key to the list */
913 	struct tcp_md5sig_key *key;
914 	struct tcp_sock *tp = tcp_sk(sk);
915 	struct tcp_md5sig_info *md5sig;
916 
917 	key = tcp_md5_do_lookup(sk, addr, family);
918 	if (key) {
919 		/* Pre-existing entry - just update that one. */
920 		memcpy(key->key, newkey, newkeylen);
921 		key->keylen = newkeylen;
922 		return 0;
923 	}
924 
925 	md5sig = rcu_dereference_protected(tp->md5sig_info,
926 					   sock_owned_by_user(sk) ||
927 					   lockdep_is_held(&sk->sk_lock.slock));
928 	if (!md5sig) {
929 		md5sig = kmalloc(sizeof(*md5sig), gfp);
930 		if (!md5sig)
931 			return -ENOMEM;
932 
933 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
934 		INIT_HLIST_HEAD(&md5sig->head);
935 		rcu_assign_pointer(tp->md5sig_info, md5sig);
936 	}
937 
938 	key = sock_kmalloc(sk, sizeof(*key), gfp);
939 	if (!key)
940 		return -ENOMEM;
941 	if (!tcp_alloc_md5sig_pool()) {
942 		sock_kfree_s(sk, key, sizeof(*key));
943 		return -ENOMEM;
944 	}
945 
946 	memcpy(key->key, newkey, newkeylen);
947 	key->keylen = newkeylen;
948 	key->family = family;
949 	memcpy(&key->addr, addr,
950 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
951 				      sizeof(struct in_addr));
952 	hlist_add_head_rcu(&key->node, &md5sig->head);
953 	return 0;
954 }
955 EXPORT_SYMBOL(tcp_md5_do_add);
956 
957 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
958 {
959 	struct tcp_md5sig_key *key;
960 
961 	key = tcp_md5_do_lookup(sk, addr, family);
962 	if (!key)
963 		return -ENOENT;
964 	hlist_del_rcu(&key->node);
965 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
966 	kfree_rcu(key, rcu);
967 	return 0;
968 }
969 EXPORT_SYMBOL(tcp_md5_do_del);
970 
971 static void tcp_clear_md5_list(struct sock *sk)
972 {
973 	struct tcp_sock *tp = tcp_sk(sk);
974 	struct tcp_md5sig_key *key;
975 	struct hlist_node *n;
976 	struct tcp_md5sig_info *md5sig;
977 
978 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
979 
980 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
981 		hlist_del_rcu(&key->node);
982 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
983 		kfree_rcu(key, rcu);
984 	}
985 }
986 
987 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
988 				 int optlen)
989 {
990 	struct tcp_md5sig cmd;
991 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
992 
993 	if (optlen < sizeof(cmd))
994 		return -EINVAL;
995 
996 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
997 		return -EFAULT;
998 
999 	if (sin->sin_family != AF_INET)
1000 		return -EINVAL;
1001 
1002 	if (!cmd.tcpm_keylen)
1003 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1004 				      AF_INET);
1005 
1006 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1007 		return -EINVAL;
1008 
1009 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1010 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1011 			      GFP_KERNEL);
1012 }
1013 
1014 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1015 					__be32 daddr, __be32 saddr, int nbytes)
1016 {
1017 	struct tcp4_pseudohdr *bp;
1018 	struct scatterlist sg;
1019 
1020 	bp = &hp->md5_blk.ip4;
1021 
1022 	/*
1023 	 * 1. the TCP pseudo-header (in the order: source IP address,
1024 	 * destination IP address, zero-padded protocol number, and
1025 	 * segment length)
1026 	 */
1027 	bp->saddr = saddr;
1028 	bp->daddr = daddr;
1029 	bp->pad = 0;
1030 	bp->protocol = IPPROTO_TCP;
1031 	bp->len = cpu_to_be16(nbytes);
1032 
1033 	sg_init_one(&sg, bp, sizeof(*bp));
1034 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1035 }
1036 
1037 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1038 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1039 {
1040 	struct tcp_md5sig_pool *hp;
1041 	struct hash_desc *desc;
1042 
1043 	hp = tcp_get_md5sig_pool();
1044 	if (!hp)
1045 		goto clear_hash_noput;
1046 	desc = &hp->md5_desc;
1047 
1048 	if (crypto_hash_init(desc))
1049 		goto clear_hash;
1050 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1051 		goto clear_hash;
1052 	if (tcp_md5_hash_header(hp, th))
1053 		goto clear_hash;
1054 	if (tcp_md5_hash_key(hp, key))
1055 		goto clear_hash;
1056 	if (crypto_hash_final(desc, md5_hash))
1057 		goto clear_hash;
1058 
1059 	tcp_put_md5sig_pool();
1060 	return 0;
1061 
1062 clear_hash:
1063 	tcp_put_md5sig_pool();
1064 clear_hash_noput:
1065 	memset(md5_hash, 0, 16);
1066 	return 1;
1067 }
1068 
1069 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1070 			const struct sock *sk,
1071 			const struct sk_buff *skb)
1072 {
1073 	struct tcp_md5sig_pool *hp;
1074 	struct hash_desc *desc;
1075 	const struct tcphdr *th = tcp_hdr(skb);
1076 	__be32 saddr, daddr;
1077 
1078 	if (sk) { /* valid for establish/request sockets */
1079 		saddr = sk->sk_rcv_saddr;
1080 		daddr = sk->sk_daddr;
1081 	} else {
1082 		const struct iphdr *iph = ip_hdr(skb);
1083 		saddr = iph->saddr;
1084 		daddr = iph->daddr;
1085 	}
1086 
1087 	hp = tcp_get_md5sig_pool();
1088 	if (!hp)
1089 		goto clear_hash_noput;
1090 	desc = &hp->md5_desc;
1091 
1092 	if (crypto_hash_init(desc))
1093 		goto clear_hash;
1094 
1095 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1096 		goto clear_hash;
1097 	if (tcp_md5_hash_header(hp, th))
1098 		goto clear_hash;
1099 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1100 		goto clear_hash;
1101 	if (tcp_md5_hash_key(hp, key))
1102 		goto clear_hash;
1103 	if (crypto_hash_final(desc, md5_hash))
1104 		goto clear_hash;
1105 
1106 	tcp_put_md5sig_pool();
1107 	return 0;
1108 
1109 clear_hash:
1110 	tcp_put_md5sig_pool();
1111 clear_hash_noput:
1112 	memset(md5_hash, 0, 16);
1113 	return 1;
1114 }
1115 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1116 
1117 #endif
1118 
1119 /* Called with rcu_read_lock() */
1120 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1121 				    const struct sk_buff *skb)
1122 {
1123 #ifdef CONFIG_TCP_MD5SIG
1124 	/*
1125 	 * This gets called for each TCP segment that arrives
1126 	 * so we want to be efficient.
1127 	 * We have 3 drop cases:
1128 	 * o No MD5 hash and one expected.
1129 	 * o MD5 hash and we're not expecting one.
1130 	 * o MD5 hash and its wrong.
1131 	 */
1132 	const __u8 *hash_location = NULL;
1133 	struct tcp_md5sig_key *hash_expected;
1134 	const struct iphdr *iph = ip_hdr(skb);
1135 	const struct tcphdr *th = tcp_hdr(skb);
1136 	int genhash;
1137 	unsigned char newhash[16];
1138 
1139 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1140 					  AF_INET);
1141 	hash_location = tcp_parse_md5sig_option(th);
1142 
1143 	/* We've parsed the options - do we have a hash? */
1144 	if (!hash_expected && !hash_location)
1145 		return false;
1146 
1147 	if (hash_expected && !hash_location) {
1148 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1149 		return true;
1150 	}
1151 
1152 	if (!hash_expected && hash_location) {
1153 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1154 		return true;
1155 	}
1156 
1157 	/* Okay, so this is hash_expected and hash_location -
1158 	 * so we need to calculate the checksum.
1159 	 */
1160 	genhash = tcp_v4_md5_hash_skb(newhash,
1161 				      hash_expected,
1162 				      NULL, skb);
1163 
1164 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1165 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1166 				     &iph->saddr, ntohs(th->source),
1167 				     &iph->daddr, ntohs(th->dest),
1168 				     genhash ? " tcp_v4_calc_md5_hash failed"
1169 				     : "");
1170 		return true;
1171 	}
1172 	return false;
1173 #endif
1174 	return false;
1175 }
1176 
1177 static void tcp_v4_init_req(struct request_sock *req,
1178 			    const struct sock *sk_listener,
1179 			    struct sk_buff *skb)
1180 {
1181 	struct inet_request_sock *ireq = inet_rsk(req);
1182 
1183 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1184 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1185 	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1186 	ireq->opt = tcp_v4_save_options(skb);
1187 }
1188 
1189 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1190 					  struct flowi *fl,
1191 					  const struct request_sock *req,
1192 					  bool *strict)
1193 {
1194 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1195 
1196 	if (strict) {
1197 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1198 			*strict = true;
1199 		else
1200 			*strict = false;
1201 	}
1202 
1203 	return dst;
1204 }
1205 
1206 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1207 	.family		=	PF_INET,
1208 	.obj_size	=	sizeof(struct tcp_request_sock),
1209 	.rtx_syn_ack	=	tcp_rtx_synack,
1210 	.send_ack	=	tcp_v4_reqsk_send_ack,
1211 	.destructor	=	tcp_v4_reqsk_destructor,
1212 	.send_reset	=	tcp_v4_send_reset,
1213 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1214 };
1215 
1216 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1217 	.mss_clamp	=	TCP_MSS_DEFAULT,
1218 #ifdef CONFIG_TCP_MD5SIG
1219 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1220 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1221 #endif
1222 	.init_req	=	tcp_v4_init_req,
1223 #ifdef CONFIG_SYN_COOKIES
1224 	.cookie_init_seq =	cookie_v4_init_sequence,
1225 #endif
1226 	.route_req	=	tcp_v4_route_req,
1227 	.init_seq	=	tcp_v4_init_sequence,
1228 	.send_synack	=	tcp_v4_send_synack,
1229 };
1230 
1231 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1232 {
1233 	/* Never answer to SYNs send to broadcast or multicast */
1234 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1235 		goto drop;
1236 
1237 	return tcp_conn_request(&tcp_request_sock_ops,
1238 				&tcp_request_sock_ipv4_ops, sk, skb);
1239 
1240 drop:
1241 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1242 	return 0;
1243 }
1244 EXPORT_SYMBOL(tcp_v4_conn_request);
1245 
1246 
1247 /*
1248  * The three way handshake has completed - we got a valid synack -
1249  * now create the new socket.
1250  */
1251 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1252 				  struct request_sock *req,
1253 				  struct dst_entry *dst,
1254 				  struct request_sock *req_unhash,
1255 				  bool *own_req)
1256 {
1257 	struct inet_request_sock *ireq;
1258 	struct inet_sock *newinet;
1259 	struct tcp_sock *newtp;
1260 	struct sock *newsk;
1261 #ifdef CONFIG_TCP_MD5SIG
1262 	struct tcp_md5sig_key *key;
1263 #endif
1264 	struct ip_options_rcu *inet_opt;
1265 
1266 	if (sk_acceptq_is_full(sk))
1267 		goto exit_overflow;
1268 
1269 	newsk = tcp_create_openreq_child(sk, req, skb);
1270 	if (!newsk)
1271 		goto exit_nonewsk;
1272 
1273 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1274 	inet_sk_rx_dst_set(newsk, skb);
1275 
1276 	newtp		      = tcp_sk(newsk);
1277 	newinet		      = inet_sk(newsk);
1278 	ireq		      = inet_rsk(req);
1279 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1280 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1281 	newsk->sk_bound_dev_if = ireq->ir_iif;
1282 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1283 	inet_opt	      = ireq->opt;
1284 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1285 	ireq->opt	      = NULL;
1286 	newinet->mc_index     = inet_iif(skb);
1287 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1288 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1289 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1290 	if (inet_opt)
1291 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1292 	newinet->inet_id = newtp->write_seq ^ jiffies;
1293 
1294 	if (!dst) {
1295 		dst = inet_csk_route_child_sock(sk, newsk, req);
1296 		if (!dst)
1297 			goto put_and_exit;
1298 	} else {
1299 		/* syncookie case : see end of cookie_v4_check() */
1300 	}
1301 	sk_setup_caps(newsk, dst);
1302 
1303 	tcp_ca_openreq_child(newsk, dst);
1304 
1305 	tcp_sync_mss(newsk, dst_mtu(dst));
1306 	newtp->advmss = dst_metric_advmss(dst);
1307 	if (tcp_sk(sk)->rx_opt.user_mss &&
1308 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1309 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1310 
1311 	tcp_initialize_rcv_mss(newsk);
1312 
1313 #ifdef CONFIG_TCP_MD5SIG
1314 	/* Copy over the MD5 key from the original socket */
1315 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1316 				AF_INET);
1317 	if (key) {
1318 		/*
1319 		 * We're using one, so create a matching key
1320 		 * on the newsk structure. If we fail to get
1321 		 * memory, then we end up not copying the key
1322 		 * across. Shucks.
1323 		 */
1324 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1325 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1326 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1327 	}
1328 #endif
1329 
1330 	if (__inet_inherit_port(sk, newsk) < 0)
1331 		goto put_and_exit;
1332 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1333 	if (*own_req)
1334 		tcp_move_syn(newtp, req);
1335 
1336 	return newsk;
1337 
1338 exit_overflow:
1339 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1340 exit_nonewsk:
1341 	dst_release(dst);
1342 exit:
1343 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1344 	return NULL;
1345 put_and_exit:
1346 	inet_csk_prepare_forced_close(newsk);
1347 	tcp_done(newsk);
1348 	goto exit;
1349 }
1350 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1351 
1352 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1353 {
1354 #ifdef CONFIG_SYN_COOKIES
1355 	const struct tcphdr *th = tcp_hdr(skb);
1356 
1357 	if (!th->syn)
1358 		sk = cookie_v4_check(sk, skb);
1359 #endif
1360 	return sk;
1361 }
1362 
1363 /* The socket must have it's spinlock held when we get
1364  * here, unless it is a TCP_LISTEN socket.
1365  *
1366  * We have a potential double-lock case here, so even when
1367  * doing backlog processing we use the BH locking scheme.
1368  * This is because we cannot sleep with the original spinlock
1369  * held.
1370  */
1371 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1372 {
1373 	struct sock *rsk;
1374 
1375 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1376 		struct dst_entry *dst = sk->sk_rx_dst;
1377 
1378 		sock_rps_save_rxhash(sk, skb);
1379 		sk_mark_napi_id(sk, skb);
1380 		if (dst) {
1381 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1382 			    !dst->ops->check(dst, 0)) {
1383 				dst_release(dst);
1384 				sk->sk_rx_dst = NULL;
1385 			}
1386 		}
1387 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1388 		return 0;
1389 	}
1390 
1391 	if (tcp_checksum_complete(skb))
1392 		goto csum_err;
1393 
1394 	if (sk->sk_state == TCP_LISTEN) {
1395 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1396 
1397 		if (!nsk)
1398 			goto discard;
1399 		if (nsk != sk) {
1400 			sock_rps_save_rxhash(nsk, skb);
1401 			sk_mark_napi_id(nsk, skb);
1402 			if (tcp_child_process(sk, nsk, skb)) {
1403 				rsk = nsk;
1404 				goto reset;
1405 			}
1406 			return 0;
1407 		}
1408 	} else
1409 		sock_rps_save_rxhash(sk, skb);
1410 
1411 	if (tcp_rcv_state_process(sk, skb)) {
1412 		rsk = sk;
1413 		goto reset;
1414 	}
1415 	return 0;
1416 
1417 reset:
1418 	tcp_v4_send_reset(rsk, skb);
1419 discard:
1420 	kfree_skb(skb);
1421 	/* Be careful here. If this function gets more complicated and
1422 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1423 	 * might be destroyed here. This current version compiles correctly,
1424 	 * but you have been warned.
1425 	 */
1426 	return 0;
1427 
1428 csum_err:
1429 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1430 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1431 	goto discard;
1432 }
1433 EXPORT_SYMBOL(tcp_v4_do_rcv);
1434 
1435 void tcp_v4_early_demux(struct sk_buff *skb)
1436 {
1437 	const struct iphdr *iph;
1438 	const struct tcphdr *th;
1439 	struct sock *sk;
1440 
1441 	if (skb->pkt_type != PACKET_HOST)
1442 		return;
1443 
1444 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1445 		return;
1446 
1447 	iph = ip_hdr(skb);
1448 	th = tcp_hdr(skb);
1449 
1450 	if (th->doff < sizeof(struct tcphdr) / 4)
1451 		return;
1452 
1453 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1454 				       iph->saddr, th->source,
1455 				       iph->daddr, ntohs(th->dest),
1456 				       skb->skb_iif);
1457 	if (sk) {
1458 		skb->sk = sk;
1459 		skb->destructor = sock_edemux;
1460 		if (sk_fullsock(sk)) {
1461 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1462 
1463 			if (dst)
1464 				dst = dst_check(dst, 0);
1465 			if (dst &&
1466 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1467 				skb_dst_set_noref(skb, dst);
1468 		}
1469 	}
1470 }
1471 
1472 /* Packet is added to VJ-style prequeue for processing in process
1473  * context, if a reader task is waiting. Apparently, this exciting
1474  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1475  * failed somewhere. Latency? Burstiness? Well, at least now we will
1476  * see, why it failed. 8)8)				  --ANK
1477  *
1478  */
1479 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1480 {
1481 	struct tcp_sock *tp = tcp_sk(sk);
1482 
1483 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1484 		return false;
1485 
1486 	if (skb->len <= tcp_hdrlen(skb) &&
1487 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1488 		return false;
1489 
1490 	/* Before escaping RCU protected region, we need to take care of skb
1491 	 * dst. Prequeue is only enabled for established sockets.
1492 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1493 	 * Instead of doing full sk_rx_dst validity here, let's perform
1494 	 * an optimistic check.
1495 	 */
1496 	if (likely(sk->sk_rx_dst))
1497 		skb_dst_drop(skb);
1498 	else
1499 		skb_dst_force_safe(skb);
1500 
1501 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1502 	tp->ucopy.memory += skb->truesize;
1503 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1504 		struct sk_buff *skb1;
1505 
1506 		BUG_ON(sock_owned_by_user(sk));
1507 
1508 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1509 			sk_backlog_rcv(sk, skb1);
1510 			NET_INC_STATS_BH(sock_net(sk),
1511 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1512 		}
1513 
1514 		tp->ucopy.memory = 0;
1515 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1516 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1517 					   POLLIN | POLLRDNORM | POLLRDBAND);
1518 		if (!inet_csk_ack_scheduled(sk))
1519 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1520 						  (3 * tcp_rto_min(sk)) / 4,
1521 						  TCP_RTO_MAX);
1522 	}
1523 	return true;
1524 }
1525 EXPORT_SYMBOL(tcp_prequeue);
1526 
1527 /*
1528  *	From tcp_input.c
1529  */
1530 
1531 int tcp_v4_rcv(struct sk_buff *skb)
1532 {
1533 	const struct iphdr *iph;
1534 	const struct tcphdr *th;
1535 	struct sock *sk;
1536 	int ret;
1537 	struct net *net = dev_net(skb->dev);
1538 
1539 	if (skb->pkt_type != PACKET_HOST)
1540 		goto discard_it;
1541 
1542 	/* Count it even if it's bad */
1543 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1544 
1545 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1546 		goto discard_it;
1547 
1548 	th = tcp_hdr(skb);
1549 
1550 	if (th->doff < sizeof(struct tcphdr) / 4)
1551 		goto bad_packet;
1552 	if (!pskb_may_pull(skb, th->doff * 4))
1553 		goto discard_it;
1554 
1555 	/* An explanation is required here, I think.
1556 	 * Packet length and doff are validated by header prediction,
1557 	 * provided case of th->doff==0 is eliminated.
1558 	 * So, we defer the checks. */
1559 
1560 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1561 		goto csum_error;
1562 
1563 	th = tcp_hdr(skb);
1564 	iph = ip_hdr(skb);
1565 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1566 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1567 	 */
1568 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1569 		sizeof(struct inet_skb_parm));
1570 	barrier();
1571 
1572 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1573 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1574 				    skb->len - th->doff * 4);
1575 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1576 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1577 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1578 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1579 	TCP_SKB_CB(skb)->sacked	 = 0;
1580 
1581 lookup:
1582 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1583 	if (!sk)
1584 		goto no_tcp_socket;
1585 
1586 process:
1587 	if (sk->sk_state == TCP_TIME_WAIT)
1588 		goto do_time_wait;
1589 
1590 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1591 		struct request_sock *req = inet_reqsk(sk);
1592 		struct sock *nsk = NULL;
1593 
1594 		sk = req->rsk_listener;
1595 		if (tcp_v4_inbound_md5_hash(sk, skb))
1596 			goto discard_and_relse;
1597 		if (likely(sk->sk_state == TCP_LISTEN)) {
1598 			nsk = tcp_check_req(sk, skb, req, false);
1599 		} else {
1600 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1601 			goto lookup;
1602 		}
1603 		if (!nsk) {
1604 			reqsk_put(req);
1605 			goto discard_it;
1606 		}
1607 		if (nsk == sk) {
1608 			sock_hold(sk);
1609 			reqsk_put(req);
1610 		} else if (tcp_child_process(sk, nsk, skb)) {
1611 			tcp_v4_send_reset(nsk, skb);
1612 			goto discard_it;
1613 		} else {
1614 			return 0;
1615 		}
1616 	}
1617 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1618 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1619 		goto discard_and_relse;
1620 	}
1621 
1622 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1623 		goto discard_and_relse;
1624 
1625 	if (tcp_v4_inbound_md5_hash(sk, skb))
1626 		goto discard_and_relse;
1627 
1628 	nf_reset(skb);
1629 
1630 	if (sk_filter(sk, skb))
1631 		goto discard_and_relse;
1632 
1633 	skb->dev = NULL;
1634 
1635 	if (sk->sk_state == TCP_LISTEN) {
1636 		ret = tcp_v4_do_rcv(sk, skb);
1637 		goto put_and_return;
1638 	}
1639 
1640 	sk_incoming_cpu_update(sk);
1641 
1642 	bh_lock_sock_nested(sk);
1643 	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1644 	ret = 0;
1645 	if (!sock_owned_by_user(sk)) {
1646 		if (!tcp_prequeue(sk, skb))
1647 			ret = tcp_v4_do_rcv(sk, skb);
1648 	} else if (unlikely(sk_add_backlog(sk, skb,
1649 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1650 		bh_unlock_sock(sk);
1651 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1652 		goto discard_and_relse;
1653 	}
1654 	bh_unlock_sock(sk);
1655 
1656 put_and_return:
1657 	sock_put(sk);
1658 
1659 	return ret;
1660 
1661 no_tcp_socket:
1662 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1663 		goto discard_it;
1664 
1665 	if (tcp_checksum_complete(skb)) {
1666 csum_error:
1667 		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1668 bad_packet:
1669 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1670 	} else {
1671 		tcp_v4_send_reset(NULL, skb);
1672 	}
1673 
1674 discard_it:
1675 	/* Discard frame. */
1676 	kfree_skb(skb);
1677 	return 0;
1678 
1679 discard_and_relse:
1680 	sock_put(sk);
1681 	goto discard_it;
1682 
1683 do_time_wait:
1684 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1685 		inet_twsk_put(inet_twsk(sk));
1686 		goto discard_it;
1687 	}
1688 
1689 	if (tcp_checksum_complete(skb)) {
1690 		inet_twsk_put(inet_twsk(sk));
1691 		goto csum_error;
1692 	}
1693 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1694 	case TCP_TW_SYN: {
1695 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1696 							&tcp_hashinfo,
1697 							iph->saddr, th->source,
1698 							iph->daddr, th->dest,
1699 							inet_iif(skb));
1700 		if (sk2) {
1701 			inet_twsk_deschedule_put(inet_twsk(sk));
1702 			sk = sk2;
1703 			goto process;
1704 		}
1705 		/* Fall through to ACK */
1706 	}
1707 	case TCP_TW_ACK:
1708 		tcp_v4_timewait_ack(sk, skb);
1709 		break;
1710 	case TCP_TW_RST:
1711 		tcp_v4_send_reset(sk, skb);
1712 		inet_twsk_deschedule_put(inet_twsk(sk));
1713 		goto discard_it;
1714 	case TCP_TW_SUCCESS:;
1715 	}
1716 	goto discard_it;
1717 }
1718 
1719 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1720 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1721 	.twsk_unique	= tcp_twsk_unique,
1722 	.twsk_destructor= tcp_twsk_destructor,
1723 };
1724 
1725 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1726 {
1727 	struct dst_entry *dst = skb_dst(skb);
1728 
1729 	if (dst && dst_hold_safe(dst)) {
1730 		sk->sk_rx_dst = dst;
1731 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1732 	}
1733 }
1734 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1735 
1736 const struct inet_connection_sock_af_ops ipv4_specific = {
1737 	.queue_xmit	   = ip_queue_xmit,
1738 	.send_check	   = tcp_v4_send_check,
1739 	.rebuild_header	   = inet_sk_rebuild_header,
1740 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1741 	.conn_request	   = tcp_v4_conn_request,
1742 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1743 	.net_header_len	   = sizeof(struct iphdr),
1744 	.setsockopt	   = ip_setsockopt,
1745 	.getsockopt	   = ip_getsockopt,
1746 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1747 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1748 	.bind_conflict	   = inet_csk_bind_conflict,
1749 #ifdef CONFIG_COMPAT
1750 	.compat_setsockopt = compat_ip_setsockopt,
1751 	.compat_getsockopt = compat_ip_getsockopt,
1752 #endif
1753 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1754 };
1755 EXPORT_SYMBOL(ipv4_specific);
1756 
1757 #ifdef CONFIG_TCP_MD5SIG
1758 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1759 	.md5_lookup		= tcp_v4_md5_lookup,
1760 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1761 	.md5_parse		= tcp_v4_parse_md5_keys,
1762 };
1763 #endif
1764 
1765 /* NOTE: A lot of things set to zero explicitly by call to
1766  *       sk_alloc() so need not be done here.
1767  */
1768 static int tcp_v4_init_sock(struct sock *sk)
1769 {
1770 	struct inet_connection_sock *icsk = inet_csk(sk);
1771 
1772 	tcp_init_sock(sk);
1773 
1774 	icsk->icsk_af_ops = &ipv4_specific;
1775 
1776 #ifdef CONFIG_TCP_MD5SIG
1777 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1778 #endif
1779 
1780 	return 0;
1781 }
1782 
1783 void tcp_v4_destroy_sock(struct sock *sk)
1784 {
1785 	struct tcp_sock *tp = tcp_sk(sk);
1786 
1787 	tcp_clear_xmit_timers(sk);
1788 
1789 	tcp_cleanup_congestion_control(sk);
1790 
1791 	/* Cleanup up the write buffer. */
1792 	tcp_write_queue_purge(sk);
1793 
1794 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1795 	__skb_queue_purge(&tp->out_of_order_queue);
1796 
1797 #ifdef CONFIG_TCP_MD5SIG
1798 	/* Clean up the MD5 key list, if any */
1799 	if (tp->md5sig_info) {
1800 		tcp_clear_md5_list(sk);
1801 		kfree_rcu(tp->md5sig_info, rcu);
1802 		tp->md5sig_info = NULL;
1803 	}
1804 #endif
1805 
1806 	/* Clean prequeue, it must be empty really */
1807 	__skb_queue_purge(&tp->ucopy.prequeue);
1808 
1809 	/* Clean up a referenced TCP bind bucket. */
1810 	if (inet_csk(sk)->icsk_bind_hash)
1811 		inet_put_port(sk);
1812 
1813 	BUG_ON(tp->fastopen_rsk);
1814 
1815 	/* If socket is aborted during connect operation */
1816 	tcp_free_fastopen_req(tp);
1817 	tcp_saved_syn_free(tp);
1818 
1819 	sk_sockets_allocated_dec(sk);
1820 
1821 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1822 		sock_release_memcg(sk);
1823 }
1824 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1825 
1826 #ifdef CONFIG_PROC_FS
1827 /* Proc filesystem TCP sock list dumping. */
1828 
1829 /*
1830  * Get next listener socket follow cur.  If cur is NULL, get first socket
1831  * starting from bucket given in st->bucket; when st->bucket is zero the
1832  * very first socket in the hash table is returned.
1833  */
1834 static void *listening_get_next(struct seq_file *seq, void *cur)
1835 {
1836 	struct inet_connection_sock *icsk;
1837 	struct hlist_nulls_node *node;
1838 	struct sock *sk = cur;
1839 	struct inet_listen_hashbucket *ilb;
1840 	struct tcp_iter_state *st = seq->private;
1841 	struct net *net = seq_file_net(seq);
1842 
1843 	if (!sk) {
1844 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1845 		spin_lock_bh(&ilb->lock);
1846 		sk = sk_nulls_head(&ilb->head);
1847 		st->offset = 0;
1848 		goto get_sk;
1849 	}
1850 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1851 	++st->num;
1852 	++st->offset;
1853 
1854 	sk = sk_nulls_next(sk);
1855 get_sk:
1856 	sk_nulls_for_each_from(sk, node) {
1857 		if (!net_eq(sock_net(sk), net))
1858 			continue;
1859 		if (sk->sk_family == st->family) {
1860 			cur = sk;
1861 			goto out;
1862 		}
1863 		icsk = inet_csk(sk);
1864 	}
1865 	spin_unlock_bh(&ilb->lock);
1866 	st->offset = 0;
1867 	if (++st->bucket < INET_LHTABLE_SIZE) {
1868 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1869 		spin_lock_bh(&ilb->lock);
1870 		sk = sk_nulls_head(&ilb->head);
1871 		goto get_sk;
1872 	}
1873 	cur = NULL;
1874 out:
1875 	return cur;
1876 }
1877 
1878 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1879 {
1880 	struct tcp_iter_state *st = seq->private;
1881 	void *rc;
1882 
1883 	st->bucket = 0;
1884 	st->offset = 0;
1885 	rc = listening_get_next(seq, NULL);
1886 
1887 	while (rc && *pos) {
1888 		rc = listening_get_next(seq, rc);
1889 		--*pos;
1890 	}
1891 	return rc;
1892 }
1893 
1894 static inline bool empty_bucket(const struct tcp_iter_state *st)
1895 {
1896 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1897 }
1898 
1899 /*
1900  * Get first established socket starting from bucket given in st->bucket.
1901  * If st->bucket is zero, the very first socket in the hash is returned.
1902  */
1903 static void *established_get_first(struct seq_file *seq)
1904 {
1905 	struct tcp_iter_state *st = seq->private;
1906 	struct net *net = seq_file_net(seq);
1907 	void *rc = NULL;
1908 
1909 	st->offset = 0;
1910 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1911 		struct sock *sk;
1912 		struct hlist_nulls_node *node;
1913 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1914 
1915 		/* Lockless fast path for the common case of empty buckets */
1916 		if (empty_bucket(st))
1917 			continue;
1918 
1919 		spin_lock_bh(lock);
1920 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1921 			if (sk->sk_family != st->family ||
1922 			    !net_eq(sock_net(sk), net)) {
1923 				continue;
1924 			}
1925 			rc = sk;
1926 			goto out;
1927 		}
1928 		spin_unlock_bh(lock);
1929 	}
1930 out:
1931 	return rc;
1932 }
1933 
1934 static void *established_get_next(struct seq_file *seq, void *cur)
1935 {
1936 	struct sock *sk = cur;
1937 	struct hlist_nulls_node *node;
1938 	struct tcp_iter_state *st = seq->private;
1939 	struct net *net = seq_file_net(seq);
1940 
1941 	++st->num;
1942 	++st->offset;
1943 
1944 	sk = sk_nulls_next(sk);
1945 
1946 	sk_nulls_for_each_from(sk, node) {
1947 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1948 			return sk;
1949 	}
1950 
1951 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1952 	++st->bucket;
1953 	return established_get_first(seq);
1954 }
1955 
1956 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1957 {
1958 	struct tcp_iter_state *st = seq->private;
1959 	void *rc;
1960 
1961 	st->bucket = 0;
1962 	rc = established_get_first(seq);
1963 
1964 	while (rc && pos) {
1965 		rc = established_get_next(seq, rc);
1966 		--pos;
1967 	}
1968 	return rc;
1969 }
1970 
1971 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1972 {
1973 	void *rc;
1974 	struct tcp_iter_state *st = seq->private;
1975 
1976 	st->state = TCP_SEQ_STATE_LISTENING;
1977 	rc	  = listening_get_idx(seq, &pos);
1978 
1979 	if (!rc) {
1980 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1981 		rc	  = established_get_idx(seq, pos);
1982 	}
1983 
1984 	return rc;
1985 }
1986 
1987 static void *tcp_seek_last_pos(struct seq_file *seq)
1988 {
1989 	struct tcp_iter_state *st = seq->private;
1990 	int offset = st->offset;
1991 	int orig_num = st->num;
1992 	void *rc = NULL;
1993 
1994 	switch (st->state) {
1995 	case TCP_SEQ_STATE_LISTENING:
1996 		if (st->bucket >= INET_LHTABLE_SIZE)
1997 			break;
1998 		st->state = TCP_SEQ_STATE_LISTENING;
1999 		rc = listening_get_next(seq, NULL);
2000 		while (offset-- && rc)
2001 			rc = listening_get_next(seq, rc);
2002 		if (rc)
2003 			break;
2004 		st->bucket = 0;
2005 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2006 		/* Fallthrough */
2007 	case TCP_SEQ_STATE_ESTABLISHED:
2008 		if (st->bucket > tcp_hashinfo.ehash_mask)
2009 			break;
2010 		rc = established_get_first(seq);
2011 		while (offset-- && rc)
2012 			rc = established_get_next(seq, rc);
2013 	}
2014 
2015 	st->num = orig_num;
2016 
2017 	return rc;
2018 }
2019 
2020 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2021 {
2022 	struct tcp_iter_state *st = seq->private;
2023 	void *rc;
2024 
2025 	if (*pos && *pos == st->last_pos) {
2026 		rc = tcp_seek_last_pos(seq);
2027 		if (rc)
2028 			goto out;
2029 	}
2030 
2031 	st->state = TCP_SEQ_STATE_LISTENING;
2032 	st->num = 0;
2033 	st->bucket = 0;
2034 	st->offset = 0;
2035 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2036 
2037 out:
2038 	st->last_pos = *pos;
2039 	return rc;
2040 }
2041 
2042 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2043 {
2044 	struct tcp_iter_state *st = seq->private;
2045 	void *rc = NULL;
2046 
2047 	if (v == SEQ_START_TOKEN) {
2048 		rc = tcp_get_idx(seq, 0);
2049 		goto out;
2050 	}
2051 
2052 	switch (st->state) {
2053 	case TCP_SEQ_STATE_LISTENING:
2054 		rc = listening_get_next(seq, v);
2055 		if (!rc) {
2056 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2057 			st->bucket = 0;
2058 			st->offset = 0;
2059 			rc	  = established_get_first(seq);
2060 		}
2061 		break;
2062 	case TCP_SEQ_STATE_ESTABLISHED:
2063 		rc = established_get_next(seq, v);
2064 		break;
2065 	}
2066 out:
2067 	++*pos;
2068 	st->last_pos = *pos;
2069 	return rc;
2070 }
2071 
2072 static void tcp_seq_stop(struct seq_file *seq, void *v)
2073 {
2074 	struct tcp_iter_state *st = seq->private;
2075 
2076 	switch (st->state) {
2077 	case TCP_SEQ_STATE_LISTENING:
2078 		if (v != SEQ_START_TOKEN)
2079 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2080 		break;
2081 	case TCP_SEQ_STATE_ESTABLISHED:
2082 		if (v)
2083 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2084 		break;
2085 	}
2086 }
2087 
2088 int tcp_seq_open(struct inode *inode, struct file *file)
2089 {
2090 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2091 	struct tcp_iter_state *s;
2092 	int err;
2093 
2094 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2095 			  sizeof(struct tcp_iter_state));
2096 	if (err < 0)
2097 		return err;
2098 
2099 	s = ((struct seq_file *)file->private_data)->private;
2100 	s->family		= afinfo->family;
2101 	s->last_pos		= 0;
2102 	return 0;
2103 }
2104 EXPORT_SYMBOL(tcp_seq_open);
2105 
2106 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2107 {
2108 	int rc = 0;
2109 	struct proc_dir_entry *p;
2110 
2111 	afinfo->seq_ops.start		= tcp_seq_start;
2112 	afinfo->seq_ops.next		= tcp_seq_next;
2113 	afinfo->seq_ops.stop		= tcp_seq_stop;
2114 
2115 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2116 			     afinfo->seq_fops, afinfo);
2117 	if (!p)
2118 		rc = -ENOMEM;
2119 	return rc;
2120 }
2121 EXPORT_SYMBOL(tcp_proc_register);
2122 
2123 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2124 {
2125 	remove_proc_entry(afinfo->name, net->proc_net);
2126 }
2127 EXPORT_SYMBOL(tcp_proc_unregister);
2128 
2129 static void get_openreq4(const struct request_sock *req,
2130 			 struct seq_file *f, int i)
2131 {
2132 	const struct inet_request_sock *ireq = inet_rsk(req);
2133 	long delta = req->rsk_timer.expires - jiffies;
2134 
2135 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2136 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2137 		i,
2138 		ireq->ir_loc_addr,
2139 		ireq->ir_num,
2140 		ireq->ir_rmt_addr,
2141 		ntohs(ireq->ir_rmt_port),
2142 		TCP_SYN_RECV,
2143 		0, 0, /* could print option size, but that is af dependent. */
2144 		1,    /* timers active (only the expire timer) */
2145 		jiffies_delta_to_clock_t(delta),
2146 		req->num_timeout,
2147 		from_kuid_munged(seq_user_ns(f),
2148 				 sock_i_uid(req->rsk_listener)),
2149 		0,  /* non standard timer */
2150 		0, /* open_requests have no inode */
2151 		0,
2152 		req);
2153 }
2154 
2155 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2156 {
2157 	int timer_active;
2158 	unsigned long timer_expires;
2159 	const struct tcp_sock *tp = tcp_sk(sk);
2160 	const struct inet_connection_sock *icsk = inet_csk(sk);
2161 	const struct inet_sock *inet = inet_sk(sk);
2162 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2163 	__be32 dest = inet->inet_daddr;
2164 	__be32 src = inet->inet_rcv_saddr;
2165 	__u16 destp = ntohs(inet->inet_dport);
2166 	__u16 srcp = ntohs(inet->inet_sport);
2167 	int rx_queue;
2168 	int state;
2169 
2170 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2171 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2172 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2173 		timer_active	= 1;
2174 		timer_expires	= icsk->icsk_timeout;
2175 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2176 		timer_active	= 4;
2177 		timer_expires	= icsk->icsk_timeout;
2178 	} else if (timer_pending(&sk->sk_timer)) {
2179 		timer_active	= 2;
2180 		timer_expires	= sk->sk_timer.expires;
2181 	} else {
2182 		timer_active	= 0;
2183 		timer_expires = jiffies;
2184 	}
2185 
2186 	state = sk_state_load(sk);
2187 	if (state == TCP_LISTEN)
2188 		rx_queue = sk->sk_ack_backlog;
2189 	else
2190 		/* Because we don't lock the socket,
2191 		 * we might find a transient negative value.
2192 		 */
2193 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2194 
2195 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2196 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2197 		i, src, srcp, dest, destp, state,
2198 		tp->write_seq - tp->snd_una,
2199 		rx_queue,
2200 		timer_active,
2201 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2202 		icsk->icsk_retransmits,
2203 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2204 		icsk->icsk_probes_out,
2205 		sock_i_ino(sk),
2206 		atomic_read(&sk->sk_refcnt), sk,
2207 		jiffies_to_clock_t(icsk->icsk_rto),
2208 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2209 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2210 		tp->snd_cwnd,
2211 		state == TCP_LISTEN ?
2212 		    fastopenq->max_qlen :
2213 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2214 }
2215 
2216 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2217 			       struct seq_file *f, int i)
2218 {
2219 	long delta = tw->tw_timer.expires - jiffies;
2220 	__be32 dest, src;
2221 	__u16 destp, srcp;
2222 
2223 	dest  = tw->tw_daddr;
2224 	src   = tw->tw_rcv_saddr;
2225 	destp = ntohs(tw->tw_dport);
2226 	srcp  = ntohs(tw->tw_sport);
2227 
2228 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2229 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2230 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2231 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2232 		atomic_read(&tw->tw_refcnt), tw);
2233 }
2234 
2235 #define TMPSZ 150
2236 
2237 static int tcp4_seq_show(struct seq_file *seq, void *v)
2238 {
2239 	struct tcp_iter_state *st;
2240 	struct sock *sk = v;
2241 
2242 	seq_setwidth(seq, TMPSZ - 1);
2243 	if (v == SEQ_START_TOKEN) {
2244 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2245 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2246 			   "inode");
2247 		goto out;
2248 	}
2249 	st = seq->private;
2250 
2251 	if (sk->sk_state == TCP_TIME_WAIT)
2252 		get_timewait4_sock(v, seq, st->num);
2253 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2254 		get_openreq4(v, seq, st->num);
2255 	else
2256 		get_tcp4_sock(v, seq, st->num);
2257 out:
2258 	seq_pad(seq, '\n');
2259 	return 0;
2260 }
2261 
2262 static const struct file_operations tcp_afinfo_seq_fops = {
2263 	.owner   = THIS_MODULE,
2264 	.open    = tcp_seq_open,
2265 	.read    = seq_read,
2266 	.llseek  = seq_lseek,
2267 	.release = seq_release_net
2268 };
2269 
2270 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2271 	.name		= "tcp",
2272 	.family		= AF_INET,
2273 	.seq_fops	= &tcp_afinfo_seq_fops,
2274 	.seq_ops	= {
2275 		.show		= tcp4_seq_show,
2276 	},
2277 };
2278 
2279 static int __net_init tcp4_proc_init_net(struct net *net)
2280 {
2281 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2282 }
2283 
2284 static void __net_exit tcp4_proc_exit_net(struct net *net)
2285 {
2286 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2287 }
2288 
2289 static struct pernet_operations tcp4_net_ops = {
2290 	.init = tcp4_proc_init_net,
2291 	.exit = tcp4_proc_exit_net,
2292 };
2293 
2294 int __init tcp4_proc_init(void)
2295 {
2296 	return register_pernet_subsys(&tcp4_net_ops);
2297 }
2298 
2299 void tcp4_proc_exit(void)
2300 {
2301 	unregister_pernet_subsys(&tcp4_net_ops);
2302 }
2303 #endif /* CONFIG_PROC_FS */
2304 
2305 struct proto tcp_prot = {
2306 	.name			= "TCP",
2307 	.owner			= THIS_MODULE,
2308 	.close			= tcp_close,
2309 	.connect		= tcp_v4_connect,
2310 	.disconnect		= tcp_disconnect,
2311 	.accept			= inet_csk_accept,
2312 	.ioctl			= tcp_ioctl,
2313 	.init			= tcp_v4_init_sock,
2314 	.destroy		= tcp_v4_destroy_sock,
2315 	.shutdown		= tcp_shutdown,
2316 	.setsockopt		= tcp_setsockopt,
2317 	.getsockopt		= tcp_getsockopt,
2318 	.recvmsg		= tcp_recvmsg,
2319 	.sendmsg		= tcp_sendmsg,
2320 	.sendpage		= tcp_sendpage,
2321 	.backlog_rcv		= tcp_v4_do_rcv,
2322 	.release_cb		= tcp_release_cb,
2323 	.hash			= inet_hash,
2324 	.unhash			= inet_unhash,
2325 	.get_port		= inet_csk_get_port,
2326 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2327 	.stream_memory_free	= tcp_stream_memory_free,
2328 	.sockets_allocated	= &tcp_sockets_allocated,
2329 	.orphan_count		= &tcp_orphan_count,
2330 	.memory_allocated	= &tcp_memory_allocated,
2331 	.memory_pressure	= &tcp_memory_pressure,
2332 	.sysctl_mem		= sysctl_tcp_mem,
2333 	.sysctl_wmem		= sysctl_tcp_wmem,
2334 	.sysctl_rmem		= sysctl_tcp_rmem,
2335 	.max_header		= MAX_TCP_HEADER,
2336 	.obj_size		= sizeof(struct tcp_sock),
2337 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2338 	.twsk_prot		= &tcp_timewait_sock_ops,
2339 	.rsk_prot		= &tcp_request_sock_ops,
2340 	.h.hashinfo		= &tcp_hashinfo,
2341 	.no_autobind		= true,
2342 #ifdef CONFIG_COMPAT
2343 	.compat_setsockopt	= compat_tcp_setsockopt,
2344 	.compat_getsockopt	= compat_tcp_getsockopt,
2345 #endif
2346 	.diag_destroy		= tcp_abort,
2347 };
2348 EXPORT_SYMBOL(tcp_prot);
2349 
2350 static void __net_exit tcp_sk_exit(struct net *net)
2351 {
2352 	int cpu;
2353 
2354 	for_each_possible_cpu(cpu)
2355 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2356 	free_percpu(net->ipv4.tcp_sk);
2357 }
2358 
2359 static int __net_init tcp_sk_init(struct net *net)
2360 {
2361 	int res, cpu;
2362 
2363 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2364 	if (!net->ipv4.tcp_sk)
2365 		return -ENOMEM;
2366 
2367 	for_each_possible_cpu(cpu) {
2368 		struct sock *sk;
2369 
2370 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2371 					   IPPROTO_TCP, net);
2372 		if (res)
2373 			goto fail;
2374 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2375 	}
2376 
2377 	net->ipv4.sysctl_tcp_ecn = 2;
2378 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2379 
2380 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2381 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2382 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2383 
2384 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2385 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2386 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2387 
2388 	return 0;
2389 fail:
2390 	tcp_sk_exit(net);
2391 
2392 	return res;
2393 }
2394 
2395 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2396 {
2397 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2398 }
2399 
2400 static struct pernet_operations __net_initdata tcp_sk_ops = {
2401        .init	   = tcp_sk_init,
2402        .exit	   = tcp_sk_exit,
2403        .exit_batch = tcp_sk_exit_batch,
2404 };
2405 
2406 void __init tcp_v4_init(void)
2407 {
2408 	inet_hashinfo_init(&tcp_hashinfo);
2409 	if (register_pernet_subsys(&tcp_sk_ops))
2410 		panic("Failed to create the TCP control socket.\n");
2411 }
2412