xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 6c9e92476bc924ede6d6d2f0bfed2c06ae148d29)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
78 
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84 
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87 
/* Sysctl knob read by tcp_twsk_unique(): when that function is called with
 * a non-NULL @twp, a TIME-WAIT bucket is only reused if this is non-zero.
 */
int sysctl_tcp_tw_reuse __read_mostly;
/* Exported sysctl knob; NOTE(review): no reader is visible in this part of
 * the file, its consumers live elsewhere.
 */
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: the RST/ACK reply builders below call this before
 * its definition appears.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

/* Socket hash tables used for the TCP lookups in this file (inet_lookup(),
 * __inet_lookup_listener()).
 */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
99 
100 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103 					  ip_hdr(skb)->saddr,
104 					  tcp_hdr(skb)->dest,
105 					  tcp_hdr(skb)->source);
106 }
107 
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 
113 	/* With PAWS, it is safe from the viewpoint
114 	   of data integrity. Even without PAWS it is safe provided sequence
115 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
116 
117 	   Actually, the idea is close to VJ's one, only timestamp cache is
118 	   held not per host, but per port pair and TW bucket is used as state
119 	   holder.
120 
121 	   If TW bucket has been already destroyed we fall back to VJ's scheme
122 	   and use initial timestamp retrieved from peer table.
123 	 */
124 	if (tcptw->tw_ts_recent_stamp &&
125 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
126 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128 		if (tp->write_seq == 0)
129 			tp->write_seq = 1;
130 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
131 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132 		sock_hold(sktw);
133 		return 1;
134 	}
135 
136 	return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
139 
/*
 * This will initiate an outgoing connection.
 *
 * @sk:       socket to connect
 * @uaddr:    destination address; interpreted as a struct sockaddr_in
 * @addr_len: size of the buffer behind @uaddr
 *
 * Returns 0 on success, or a negative errno:
 *   -EINVAL        @addr_len is too small, or source routing is in use and
 *                  the destination address is zero
 *   -EAFNOSUPPORT  the address family is not AF_INET
 *   -ENETUNREACH   the resulting route is multicast or broadcast
 *   otherwise the error reported by route lookup, inet_hash_connect() or
 *   tcp_connect().
 *
 * NOTE(review): the rcu_dereference_protected(..., sock_owned_by_user(sk))
 * below implies the caller is expected to own the socket lock.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	/* With a source-route option, route towards the option's first hop
	 * (faddr) instead of the final destination.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	/* Remember the ports used for this first lookup so that
	 * ip_route_newports() can compare them with the final ones later.
	 */
	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP cannot connect to a multicast or broadcast destination. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	/* Without source routing, adopt the destination chosen by routing. */
	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	/* Unbound socket: take the source address selected by the route. */
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	/* Account for the length of any IP options in the extension header
	 * length.
	 */
	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	inet_set_txhash(sk);

	/* Redo the route lookup now that the final ports are known. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* Nothing to release on the failure path. */
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	/* Pick an initial sequence number unless one was inherited above or
	 * the socket is in repair mode.
	 */
	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	/* The route was handed to the socket by sk_setup_caps(); clear the
	 * local pointer so the failure path does not release it again.
	 */
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
265 EXPORT_SYMBOL(tcp_v4_connect);
266 
267 /*
268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269  * It can be called through tcp_release_cb() if socket was owned by user
270  * at the time tcp_v4_err() was called to handle ICMP message.
271  */
272 void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274 	struct dst_entry *dst;
275 	struct inet_sock *inet = inet_sk(sk);
276 	u32 mtu = tcp_sk(sk)->mtu_info;
277 
278 	dst = inet_csk_update_pmtu(sk, mtu);
279 	if (!dst)
280 		return;
281 
282 	/* Something is about to be wrong... Remember soft error
283 	 * for the case, if this connection will not able to recover.
284 	 */
285 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286 		sk->sk_err_soft = EMSGSIZE;
287 
288 	mtu = dst_mtu(dst);
289 
290 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291 	    ip_sk_accept_pmtu(sk) &&
292 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293 		tcp_sync_mss(sk, mtu);
294 
295 		/* Resend the TCP packet because it's
296 		 * clear that the old packet has been
297 		 * dropped. This is the new "fast" path mtu
298 		 * discovery.
299 		 */
300 		tcp_simple_retransmit(sk);
301 	} /* else let the usual retransmit timer handle it */
302 }
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
304 
305 static void do_redirect(struct sk_buff *skb, struct sock *sk)
306 {
307 	struct dst_entry *dst = __sk_dst_check(sk, 0);
308 
309 	if (dst)
310 		dst->ops->redirect(dst, sk, skb);
311 }
312 
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 * @icmp_skb: the ICMP message; its data starts at the IP header of the
 *            offending packet, followed by the start of its TCP header.
 * @info:     extra value from the ICMP message; stored as the new MTU
 *            for ICMP_FRAG_NEEDED.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	/* Find the socket that sent the packet quoted in the ICMP payload. */
	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	/* Nothing to report to a TIME-WAIT socket; just drop the reference. */
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Ignore errors quoting a sequence number outside the range of data
	 * currently in flight (listeners are checked per request below).
	 */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* First stage: map the ICMP type/code to an errno in 'err', or
	 * handle the message completely and jump to 'out'.
	 */
	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer the MTU update; the reference taken
				 * here is only acquired when this call is
				 * the one that set the flag.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		/* Undo one step of exponential backoff and recompute the RTO. */
		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		/* Time left of the new RTO, counted from when the head of
		 * the write queue was last sent; zero if already elapsed.
		 */
		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	/* Second stage: deliver 'err' according to the socket state. */
	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && fastopen->sk == NULL)
			break;

		if (!sock_owned_by_user(sk)) {
			/* Hard error: report it and tear the socket down. */
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	/* Drop the lock taken above and the reference from inet_lookup(). */
	bh_unlock_sock(sk);
	sock_put(sk);
}
539 
540 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
541 {
542 	struct tcphdr *th = tcp_hdr(skb);
543 
544 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
545 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
546 		skb->csum_start = skb_transport_header(skb) - skb->head;
547 		skb->csum_offset = offsetof(struct tcphdr, check);
548 	} else {
549 		th->check = tcp_v4_check(skb->len, saddr, daddr,
550 					 csum_partial(th,
551 						      th->doff << 2,
552 						      skb->csum));
553 	}
554 }
555 
556 /* This routine computes an IPv4 TCP checksum. */
557 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
558 {
559 	const struct inet_sock *inet = inet_sk(sk);
560 
561 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
562 }
563 EXPORT_SYMBOL(tcp_v4_send_check);
564 
565 /*
566  *	This routine will send an RST to the other tcp.
567  *
568  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
569  *		      for reset.
570  *	Answer: if a packet caused RST, it is not for a socket
571  *		existing in our system, if it is matched to a socket,
572  *		it is just duplicate segment or bug in other side's TCP.
573  *		So that we build reply only basing on parameters
574  *		arrived with segment.
575  *	Exception: precedence violation. We do not implement it in any case.
576  */
577 
578 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
579 {
580 	const struct tcphdr *th = tcp_hdr(skb);
581 	struct {
582 		struct tcphdr th;
583 #ifdef CONFIG_TCP_MD5SIG
584 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
585 #endif
586 	} rep;
587 	struct ip_reply_arg arg;
588 #ifdef CONFIG_TCP_MD5SIG
589 	struct tcp_md5sig_key *key;
590 	const __u8 *hash_location = NULL;
591 	unsigned char newhash[16];
592 	int genhash;
593 	struct sock *sk1 = NULL;
594 #endif
595 	struct net *net;
596 
597 	/* Never send a reset in response to a reset. */
598 	if (th->rst)
599 		return;
600 
601 	/* If sk not NULL, it means we did a successful lookup and incoming
602 	 * route had to be correct. prequeue might have dropped our dst.
603 	 */
604 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
605 		return;
606 
607 	/* Swap the send and the receive. */
608 	memset(&rep, 0, sizeof(rep));
609 	rep.th.dest   = th->source;
610 	rep.th.source = th->dest;
611 	rep.th.doff   = sizeof(struct tcphdr) / 4;
612 	rep.th.rst    = 1;
613 
614 	if (th->ack) {
615 		rep.th.seq = th->ack_seq;
616 	} else {
617 		rep.th.ack = 1;
618 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
619 				       skb->len - (th->doff << 2));
620 	}
621 
622 	memset(&arg, 0, sizeof(arg));
623 	arg.iov[0].iov_base = (unsigned char *)&rep;
624 	arg.iov[0].iov_len  = sizeof(rep.th);
625 
626 #ifdef CONFIG_TCP_MD5SIG
627 	hash_location = tcp_parse_md5sig_option(th);
628 	if (!sk && hash_location) {
629 		/*
630 		 * active side is lost. Try to find listening socket through
631 		 * source port, and then find md5 key through listening socket.
632 		 * we are not loose security here:
633 		 * Incoming packet is checked with md5 hash with finding key,
634 		 * no RST generated if md5 hash doesn't match.
635 		 */
636 		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
637 					     &tcp_hashinfo, ip_hdr(skb)->saddr,
638 					     th->source, ip_hdr(skb)->daddr,
639 					     ntohs(th->source), inet_iif(skb));
640 		/* don't send rst if it can't find key */
641 		if (!sk1)
642 			return;
643 		rcu_read_lock();
644 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
645 					&ip_hdr(skb)->saddr, AF_INET);
646 		if (!key)
647 			goto release_sk1;
648 
649 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
650 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
651 			goto release_sk1;
652 	} else {
653 		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
654 					     &ip_hdr(skb)->saddr,
655 					     AF_INET) : NULL;
656 	}
657 
658 	if (key) {
659 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
660 				   (TCPOPT_NOP << 16) |
661 				   (TCPOPT_MD5SIG << 8) |
662 				   TCPOLEN_MD5SIG);
663 		/* Update length and the length the header thinks exists */
664 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
665 		rep.th.doff = arg.iov[0].iov_len / 4;
666 
667 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
668 				     key, ip_hdr(skb)->saddr,
669 				     ip_hdr(skb)->daddr, &rep.th);
670 	}
671 #endif
672 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
673 				      ip_hdr(skb)->saddr, /* XXX */
674 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
675 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
676 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
677 	/* When socket is gone, all binding information is lost.
678 	 * routing might fail in this case. No choice here, if we choose to force
679 	 * input interface, we will misroute in case of asymmetric route.
680 	 */
681 	if (sk)
682 		arg.bound_dev_if = sk->sk_bound_dev_if;
683 
684 	net = dev_net(skb_dst(skb)->dev);
685 	arg.tos = ip_hdr(skb)->tos;
686 	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
687 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
688 			      &arg, arg.iov[0].iov_len);
689 
690 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
691 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
692 
693 #ifdef CONFIG_TCP_MD5SIG
694 release_sk1:
695 	if (sk1) {
696 		rcu_read_unlock();
697 		sock_put(sk1);
698 	}
699 #endif
700 }
701 
702 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
703    outside socket context is ugly, certainly. What can I do?
704  */
705 
/*
 * Build and send a bare ACK in reply to @skb, using only the values passed
 * in and the headers of @skb (no socket argument is taken).
 *
 * @skb:         segment being answered; ports and addresses are swapped
 * @seq:         sequence number to send
 * @ack:         acknowledgment number to send
 * @win:         window value to advertise
 * @tsval:       timestamp value; only used when @tsecr is non-zero
 * @tsecr:       timestamp echo; non-zero adds a timestamp option
 * @oif:         if non-zero, device index the reply is bound to
 * @key:         MD5 key; if non-NULL (and CONFIG_TCP_MD5SIG is set) an
 *               MD5 signature option is appended
 * @reply_flags: flags copied into the ip_reply_arg
 * @tos:         TOS value for the reply
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply buffer: TCP header followed by room for the timestamp
	 * option and, when configured, the MD5 signature option.
	 */
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	/* Only the header is cleared; option words are written explicitly
	 * before they are included in iov_len.
	 */
	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* The MD5 option goes after the three timestamp words when
		 * a timestamp option was written above.
		 */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
776 
777 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
778 {
779 	struct inet_timewait_sock *tw = inet_twsk(sk);
780 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
781 
782 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
783 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
784 			tcp_time_stamp + tcptw->tw_ts_offset,
785 			tcptw->tw_ts_recent,
786 			tw->tw_bound_dev_if,
787 			tcp_twsk_md5_key(tcptw),
788 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
789 			tw->tw_tos
790 			);
791 
792 	inet_twsk_put(tw);
793 }
794 
795 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
796 				  struct request_sock *req)
797 {
798 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
799 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
800 	 */
801 	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
802 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
803 			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
804 			tcp_time_stamp,
805 			req->ts_recent,
806 			0,
807 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
808 					  AF_INET),
809 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
810 			ip_hdr(skb)->tos);
811 }
812 
813 /*
814  *	Send a SYN-ACK after having received a SYN.
815  *	This still operates on a request_sock only, not on a big
816  *	socket.
817  */
818 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
819 			      struct flowi *fl,
820 			      struct request_sock *req,
821 			      u16 queue_mapping,
822 			      struct tcp_fastopen_cookie *foc)
823 {
824 	const struct inet_request_sock *ireq = inet_rsk(req);
825 	struct flowi4 fl4;
826 	int err = -1;
827 	struct sk_buff *skb;
828 
829 	/* First, grab a route. */
830 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
831 		return -1;
832 
833 	skb = tcp_make_synack(sk, dst, req, foc);
834 
835 	if (skb) {
836 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
837 
838 		skb_set_queue_mapping(skb, queue_mapping);
839 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
840 					    ireq->ir_rmt_addr,
841 					    ireq->opt);
842 		err = net_xmit_eval(err);
843 	}
844 
845 	return err;
846 }
847 
848 /*
849  *	IPv4 request_sock destructor.
850  */
851 static void tcp_v4_reqsk_destructor(struct request_sock *req)
852 {
853 	kfree(inet_rsk(req)->opt);
854 }
855 
856 /*
857  * Return true if a syncookie should be sent
858  */
859 bool tcp_syn_flood_action(struct sock *sk,
860 			 const struct sk_buff *skb,
861 			 const char *proto)
862 {
863 	const char *msg = "Dropping request";
864 	bool want_cookie = false;
865 	struct listen_sock *lopt;
866 
867 #ifdef CONFIG_SYN_COOKIES
868 	if (sysctl_tcp_syncookies) {
869 		msg = "Sending cookies";
870 		want_cookie = true;
871 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
872 	} else
873 #endif
874 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
875 
876 	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
877 	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
878 		lopt->synflood_warned = 1;
879 		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
880 			proto, ntohs(tcp_hdr(skb)->dest), msg);
881 	}
882 	return want_cookie;
883 }
884 EXPORT_SYMBOL(tcp_syn_flood_action);
885 
886 #ifdef CONFIG_TCP_MD5SIG
887 /*
888  * RFC2385 MD5 checksumming requires a mapping of
889  * IP address->MD5 Key.
890  * We need to maintain these in the sk structure.
891  */
892 
893 /* Find the Key structure for an address.  */
894 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
895 					 const union tcp_md5_addr *addr,
896 					 int family)
897 {
898 	struct tcp_sock *tp = tcp_sk(sk);
899 	struct tcp_md5sig_key *key;
900 	unsigned int size = sizeof(struct in_addr);
901 	struct tcp_md5sig_info *md5sig;
902 
903 	/* caller either holds rcu_read_lock() or socket lock */
904 	md5sig = rcu_dereference_check(tp->md5sig_info,
905 				       sock_owned_by_user(sk) ||
906 				       lockdep_is_held(&sk->sk_lock.slock));
907 	if (!md5sig)
908 		return NULL;
909 #if IS_ENABLED(CONFIG_IPV6)
910 	if (family == AF_INET6)
911 		size = sizeof(struct in6_addr);
912 #endif
913 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
914 		if (key->family != family)
915 			continue;
916 		if (!memcmp(&key->addr, addr, size))
917 			return key;
918 	}
919 	return NULL;
920 }
921 EXPORT_SYMBOL(tcp_md5_do_lookup);
922 
923 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
924 					 struct sock *addr_sk)
925 {
926 	union tcp_md5_addr *addr;
927 
928 	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
929 	return tcp_md5_do_lookup(sk, addr, AF_INET);
930 }
931 EXPORT_SYMBOL(tcp_v4_md5_lookup);
932 
933 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
934 						      struct request_sock *req)
935 {
936 	union tcp_md5_addr *addr;
937 
938 	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
939 	return tcp_md5_do_lookup(sk, addr, AF_INET);
940 }
941 
/*
 * Add or update the MD5 key for peer @addr on socket @sk.
 * This can be called on a newly created socket, from other files.
 *
 * @sk:        socket that owns the key list
 * @addr:      peer address the key applies to
 * @family:    address family of @addr (AF_INET6 copies a struct in6_addr,
 *             anything else a struct in_addr)
 * @newkey:    key bytes
 * @newkeylen: number of bytes in @newkey
 * @gfp:       allocation flags for the info structure and the key
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.
 *
 * NOTE(review): @newkeylen is not range-checked here; the visible caller
 * tcp_v4_parse_md5_keys() rejects lengths above TCP_MD5SIG_MAXKEYLEN before
 * calling - confirm other callers do the same.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		/* First key on this socket: allocate and publish the list
		 * head.
		 */
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	/* The key is allocated with sock_kmalloc(), i.e. against the socket. */
	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	/* Fill the entry completely before linking it into the RCU list. */
	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
988 EXPORT_SYMBOL(tcp_md5_do_add);
989 
990 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
991 {
992 	struct tcp_md5sig_key *key;
993 
994 	key = tcp_md5_do_lookup(sk, addr, family);
995 	if (!key)
996 		return -ENOENT;
997 	hlist_del_rcu(&key->node);
998 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
999 	kfree_rcu(key, rcu);
1000 	return 0;
1001 }
1002 EXPORT_SYMBOL(tcp_md5_do_del);
1003 
1004 static void tcp_clear_md5_list(struct sock *sk)
1005 {
1006 	struct tcp_sock *tp = tcp_sk(sk);
1007 	struct tcp_md5sig_key *key;
1008 	struct hlist_node *n;
1009 	struct tcp_md5sig_info *md5sig;
1010 
1011 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1012 
1013 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1014 		hlist_del_rcu(&key->node);
1015 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1016 		kfree_rcu(key, rcu);
1017 	}
1018 }
1019 
1020 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1021 				 int optlen)
1022 {
1023 	struct tcp_md5sig cmd;
1024 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1025 
1026 	if (optlen < sizeof(cmd))
1027 		return -EINVAL;
1028 
1029 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1030 		return -EFAULT;
1031 
1032 	if (sin->sin_family != AF_INET)
1033 		return -EINVAL;
1034 
1035 	if (!cmd.tcpm_keylen)
1036 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1037 				      AF_INET);
1038 
1039 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1040 		return -EINVAL;
1041 
1042 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1043 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1044 			      GFP_KERNEL);
1045 }
1046 
1047 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1048 					__be32 daddr, __be32 saddr, int nbytes)
1049 {
1050 	struct tcp4_pseudohdr *bp;
1051 	struct scatterlist sg;
1052 
1053 	bp = &hp->md5_blk.ip4;
1054 
1055 	/*
1056 	 * 1. the TCP pseudo-header (in the order: source IP address,
1057 	 * destination IP address, zero-padded protocol number, and
1058 	 * segment length)
1059 	 */
1060 	bp->saddr = saddr;
1061 	bp->daddr = daddr;
1062 	bp->pad = 0;
1063 	bp->protocol = IPPROTO_TCP;
1064 	bp->len = cpu_to_be16(nbytes);
1065 
1066 	sg_init_one(&sg, bp, sizeof(*bp));
1067 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1068 }
1069 
1070 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1071 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1072 {
1073 	struct tcp_md5sig_pool *hp;
1074 	struct hash_desc *desc;
1075 
1076 	hp = tcp_get_md5sig_pool();
1077 	if (!hp)
1078 		goto clear_hash_noput;
1079 	desc = &hp->md5_desc;
1080 
1081 	if (crypto_hash_init(desc))
1082 		goto clear_hash;
1083 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1084 		goto clear_hash;
1085 	if (tcp_md5_hash_header(hp, th))
1086 		goto clear_hash;
1087 	if (tcp_md5_hash_key(hp, key))
1088 		goto clear_hash;
1089 	if (crypto_hash_final(desc, md5_hash))
1090 		goto clear_hash;
1091 
1092 	tcp_put_md5sig_pool();
1093 	return 0;
1094 
1095 clear_hash:
1096 	tcp_put_md5sig_pool();
1097 clear_hash_noput:
1098 	memset(md5_hash, 0, 16);
1099 	return 1;
1100 }
1101 
1102 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1103 			const struct sock *sk, const struct request_sock *req,
1104 			const struct sk_buff *skb)
1105 {
1106 	struct tcp_md5sig_pool *hp;
1107 	struct hash_desc *desc;
1108 	const struct tcphdr *th = tcp_hdr(skb);
1109 	__be32 saddr, daddr;
1110 
1111 	if (sk) {
1112 		saddr = inet_sk(sk)->inet_saddr;
1113 		daddr = inet_sk(sk)->inet_daddr;
1114 	} else if (req) {
1115 		saddr = inet_rsk(req)->ir_loc_addr;
1116 		daddr = inet_rsk(req)->ir_rmt_addr;
1117 	} else {
1118 		const struct iphdr *iph = ip_hdr(skb);
1119 		saddr = iph->saddr;
1120 		daddr = iph->daddr;
1121 	}
1122 
1123 	hp = tcp_get_md5sig_pool();
1124 	if (!hp)
1125 		goto clear_hash_noput;
1126 	desc = &hp->md5_desc;
1127 
1128 	if (crypto_hash_init(desc))
1129 		goto clear_hash;
1130 
1131 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1132 		goto clear_hash;
1133 	if (tcp_md5_hash_header(hp, th))
1134 		goto clear_hash;
1135 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1136 		goto clear_hash;
1137 	if (tcp_md5_hash_key(hp, key))
1138 		goto clear_hash;
1139 	if (crypto_hash_final(desc, md5_hash))
1140 		goto clear_hash;
1141 
1142 	tcp_put_md5sig_pool();
1143 	return 0;
1144 
1145 clear_hash:
1146 	tcp_put_md5sig_pool();
1147 clear_hash_noput:
1148 	memset(md5_hash, 0, 16);
1149 	return 1;
1150 }
1151 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1152 
1153 static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
1154 				      const struct sk_buff *skb)
1155 {
1156 	/*
1157 	 * This gets called for each TCP segment that arrives
1158 	 * so we want to be efficient.
1159 	 * We have 3 drop cases:
1160 	 * o No MD5 hash and one expected.
1161 	 * o MD5 hash and we're not expecting one.
1162 	 * o MD5 hash and its wrong.
1163 	 */
1164 	const __u8 *hash_location = NULL;
1165 	struct tcp_md5sig_key *hash_expected;
1166 	const struct iphdr *iph = ip_hdr(skb);
1167 	const struct tcphdr *th = tcp_hdr(skb);
1168 	int genhash;
1169 	unsigned char newhash[16];
1170 
1171 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1172 					  AF_INET);
1173 	hash_location = tcp_parse_md5sig_option(th);
1174 
1175 	/* We've parsed the options - do we have a hash? */
1176 	if (!hash_expected && !hash_location)
1177 		return false;
1178 
1179 	if (hash_expected && !hash_location) {
1180 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1181 		return true;
1182 	}
1183 
1184 	if (!hash_expected && hash_location) {
1185 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1186 		return true;
1187 	}
1188 
1189 	/* Okay, so this is hash_expected and hash_location -
1190 	 * so we need to calculate the checksum.
1191 	 */
1192 	genhash = tcp_v4_md5_hash_skb(newhash,
1193 				      hash_expected,
1194 				      NULL, NULL, skb);
1195 
1196 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1197 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1198 				     &iph->saddr, ntohs(th->source),
1199 				     &iph->daddr, ntohs(th->dest),
1200 				     genhash ? " tcp_v4_calc_md5_hash failed"
1201 				     : "");
1202 		return true;
1203 	}
1204 	return false;
1205 }
1206 
1207 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1208 {
1209 	bool ret;
1210 
1211 	rcu_read_lock();
1212 	ret = __tcp_v4_inbound_md5_hash(sk, skb);
1213 	rcu_read_unlock();
1214 
1215 	return ret;
1216 }
1217 
1218 #endif
1219 
1220 static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
1221 			    struct sk_buff *skb)
1222 {
1223 	struct inet_request_sock *ireq = inet_rsk(req);
1224 
1225 	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
1226 	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
1227 	ireq->no_srccheck = inet_sk(sk)->transparent;
1228 	ireq->opt = tcp_v4_save_options(skb);
1229 }
1230 
1231 static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
1232 					  const struct request_sock *req,
1233 					  bool *strict)
1234 {
1235 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1236 
1237 	if (strict) {
1238 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1239 			*strict = true;
1240 		else
1241 			*strict = false;
1242 	}
1243 
1244 	return dst;
1245 }
1246 
1247 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1248 	.family		=	PF_INET,
1249 	.obj_size	=	sizeof(struct tcp_request_sock),
1250 	.rtx_syn_ack	=	tcp_rtx_synack,
1251 	.send_ack	=	tcp_v4_reqsk_send_ack,
1252 	.destructor	=	tcp_v4_reqsk_destructor,
1253 	.send_reset	=	tcp_v4_send_reset,
1254 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1255 };
1256 
1257 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1258 	.mss_clamp	=	TCP_MSS_DEFAULT,
1259 #ifdef CONFIG_TCP_MD5SIG
1260 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1261 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1262 #endif
1263 	.init_req	=	tcp_v4_init_req,
1264 #ifdef CONFIG_SYN_COOKIES
1265 	.cookie_init_seq =	cookie_v4_init_sequence,
1266 #endif
1267 	.route_req	=	tcp_v4_route_req,
1268 	.init_seq	=	tcp_v4_init_sequence,
1269 	.send_synack	=	tcp_v4_send_synack,
1270 	.queue_hash_add =	inet_csk_reqsk_queue_hash_add,
1271 };
1272 
1273 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1274 {
1275 	/* Never answer to SYNs send to broadcast or multicast */
1276 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1277 		goto drop;
1278 
1279 	return tcp_conn_request(&tcp_request_sock_ops,
1280 				&tcp_request_sock_ipv4_ops, sk, skb);
1281 
1282 drop:
1283 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1284 	return 0;
1285 }
1286 EXPORT_SYMBOL(tcp_v4_conn_request);
1287 
1288 
1289 /*
1290  * The three way handshake has completed - we got a valid synack -
1291  * now create the new socket.
1292  */
1293 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1294 				  struct request_sock *req,
1295 				  struct dst_entry *dst)
1296 {
1297 	struct inet_request_sock *ireq;
1298 	struct inet_sock *newinet;
1299 	struct tcp_sock *newtp;
1300 	struct sock *newsk;
1301 #ifdef CONFIG_TCP_MD5SIG
1302 	struct tcp_md5sig_key *key;
1303 #endif
1304 	struct ip_options_rcu *inet_opt;
1305 
1306 	if (sk_acceptq_is_full(sk))
1307 		goto exit_overflow;
1308 
1309 	newsk = tcp_create_openreq_child(sk, req, skb);
1310 	if (!newsk)
1311 		goto exit_nonewsk;
1312 
1313 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1314 	inet_sk_rx_dst_set(newsk, skb);
1315 
1316 	newtp		      = tcp_sk(newsk);
1317 	newinet		      = inet_sk(newsk);
1318 	ireq		      = inet_rsk(req);
1319 	newinet->inet_daddr   = ireq->ir_rmt_addr;
1320 	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
1321 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1322 	inet_opt	      = ireq->opt;
1323 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1324 	ireq->opt	      = NULL;
1325 	newinet->mc_index     = inet_iif(skb);
1326 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1327 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1328 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1329 	inet_set_txhash(newsk);
1330 	if (inet_opt)
1331 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1332 	newinet->inet_id = newtp->write_seq ^ jiffies;
1333 
1334 	if (!dst) {
1335 		dst = inet_csk_route_child_sock(sk, newsk, req);
1336 		if (!dst)
1337 			goto put_and_exit;
1338 	} else {
1339 		/* syncookie case : see end of cookie_v4_check() */
1340 	}
1341 	sk_setup_caps(newsk, dst);
1342 
1343 	tcp_sync_mss(newsk, dst_mtu(dst));
1344 	newtp->advmss = dst_metric_advmss(dst);
1345 	if (tcp_sk(sk)->rx_opt.user_mss &&
1346 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1347 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1348 
1349 	tcp_initialize_rcv_mss(newsk);
1350 
1351 #ifdef CONFIG_TCP_MD5SIG
1352 	/* Copy over the MD5 key from the original socket */
1353 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1354 				AF_INET);
1355 	if (key != NULL) {
1356 		/*
1357 		 * We're using one, so create a matching key
1358 		 * on the newsk structure. If we fail to get
1359 		 * memory, then we end up not copying the key
1360 		 * across. Shucks.
1361 		 */
1362 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1363 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1364 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1365 	}
1366 #endif
1367 
1368 	if (__inet_inherit_port(sk, newsk) < 0)
1369 		goto put_and_exit;
1370 	__inet_hash_nolisten(newsk, NULL);
1371 
1372 	return newsk;
1373 
1374 exit_overflow:
1375 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1376 exit_nonewsk:
1377 	dst_release(dst);
1378 exit:
1379 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1380 	return NULL;
1381 put_and_exit:
1382 	inet_csk_prepare_forced_close(newsk);
1383 	tcp_done(newsk);
1384 	goto exit;
1385 }
1386 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1387 
1388 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1389 {
1390 	struct tcphdr *th = tcp_hdr(skb);
1391 	const struct iphdr *iph = ip_hdr(skb);
1392 	struct sock *nsk;
1393 	struct request_sock **prev;
1394 	/* Find possible connection requests. */
1395 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1396 						       iph->saddr, iph->daddr);
1397 	if (req)
1398 		return tcp_check_req(sk, skb, req, prev, false);
1399 
1400 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1401 			th->source, iph->daddr, th->dest, inet_iif(skb));
1402 
1403 	if (nsk) {
1404 		if (nsk->sk_state != TCP_TIME_WAIT) {
1405 			bh_lock_sock(nsk);
1406 			return nsk;
1407 		}
1408 		inet_twsk_put(inet_twsk(nsk));
1409 		return NULL;
1410 	}
1411 
1412 #ifdef CONFIG_SYN_COOKIES
1413 	if (!th->syn)
1414 		sk = cookie_v4_check(sk, skb);
1415 #endif
1416 	return sk;
1417 }
1418 
1419 /* The socket must have it's spinlock held when we get
1420  * here.
1421  *
1422  * We have a potential double-lock case here, so even when
1423  * doing backlog processing we use the BH locking scheme.
1424  * This is because we cannot sleep with the original spinlock
1425  * held.
1426  */
1427 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1428 {
1429 	struct sock *rsk;
1430 
1431 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1432 		struct dst_entry *dst = sk->sk_rx_dst;
1433 
1434 		sock_rps_save_rxhash(sk, skb);
1435 		if (dst) {
1436 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1437 			    dst->ops->check(dst, 0) == NULL) {
1438 				dst_release(dst);
1439 				sk->sk_rx_dst = NULL;
1440 			}
1441 		}
1442 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1443 		return 0;
1444 	}
1445 
1446 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1447 		goto csum_err;
1448 
1449 	if (sk->sk_state == TCP_LISTEN) {
1450 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1451 		if (!nsk)
1452 			goto discard;
1453 
1454 		if (nsk != sk) {
1455 			sock_rps_save_rxhash(nsk, skb);
1456 			if (tcp_child_process(sk, nsk, skb)) {
1457 				rsk = nsk;
1458 				goto reset;
1459 			}
1460 			return 0;
1461 		}
1462 	} else
1463 		sock_rps_save_rxhash(sk, skb);
1464 
1465 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1466 		rsk = sk;
1467 		goto reset;
1468 	}
1469 	return 0;
1470 
1471 reset:
1472 	tcp_v4_send_reset(rsk, skb);
1473 discard:
1474 	kfree_skb(skb);
1475 	/* Be careful here. If this function gets more complicated and
1476 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1477 	 * might be destroyed here. This current version compiles correctly,
1478 	 * but you have been warned.
1479 	 */
1480 	return 0;
1481 
1482 csum_err:
1483 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1484 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1485 	goto discard;
1486 }
1487 EXPORT_SYMBOL(tcp_v4_do_rcv);
1488 
1489 void tcp_v4_early_demux(struct sk_buff *skb)
1490 {
1491 	const struct iphdr *iph;
1492 	const struct tcphdr *th;
1493 	struct sock *sk;
1494 
1495 	if (skb->pkt_type != PACKET_HOST)
1496 		return;
1497 
1498 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1499 		return;
1500 
1501 	iph = ip_hdr(skb);
1502 	th = tcp_hdr(skb);
1503 
1504 	if (th->doff < sizeof(struct tcphdr) / 4)
1505 		return;
1506 
1507 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1508 				       iph->saddr, th->source,
1509 				       iph->daddr, ntohs(th->dest),
1510 				       skb->skb_iif);
1511 	if (sk) {
1512 		skb->sk = sk;
1513 		skb->destructor = sock_edemux;
1514 		if (sk->sk_state != TCP_TIME_WAIT) {
1515 			struct dst_entry *dst = sk->sk_rx_dst;
1516 
1517 			if (dst)
1518 				dst = dst_check(dst, 0);
1519 			if (dst &&
1520 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1521 				skb_dst_set_noref(skb, dst);
1522 		}
1523 	}
1524 }
1525 
1526 /* Packet is added to VJ-style prequeue for processing in process
1527  * context, if a reader task is waiting. Apparently, this exciting
1528  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1529  * failed somewhere. Latency? Burstiness? Well, at least now we will
1530  * see, why it failed. 8)8)				  --ANK
1531  *
1532  */
1533 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1534 {
1535 	struct tcp_sock *tp = tcp_sk(sk);
1536 
1537 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1538 		return false;
1539 
1540 	if (skb->len <= tcp_hdrlen(skb) &&
1541 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1542 		return false;
1543 
1544 	/* Before escaping RCU protected region, we need to take care of skb
1545 	 * dst. Prequeue is only enabled for established sockets.
1546 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1547 	 * Instead of doing full sk_rx_dst validity here, let's perform
1548 	 * an optimistic check.
1549 	 */
1550 	if (likely(sk->sk_rx_dst))
1551 		skb_dst_drop(skb);
1552 	else
1553 		skb_dst_force(skb);
1554 
1555 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1556 	tp->ucopy.memory += skb->truesize;
1557 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1558 		struct sk_buff *skb1;
1559 
1560 		BUG_ON(sock_owned_by_user(sk));
1561 
1562 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1563 			sk_backlog_rcv(sk, skb1);
1564 			NET_INC_STATS_BH(sock_net(sk),
1565 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1566 		}
1567 
1568 		tp->ucopy.memory = 0;
1569 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1570 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1571 					   POLLIN | POLLRDNORM | POLLRDBAND);
1572 		if (!inet_csk_ack_scheduled(sk))
1573 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1574 						  (3 * tcp_rto_min(sk)) / 4,
1575 						  TCP_RTO_MAX);
1576 	}
1577 	return true;
1578 }
1579 EXPORT_SYMBOL(tcp_prequeue);
1580 
1581 /*
1582  *	From tcp_input.c
1583  */
1584 
/*
 * Receive entry point for an IPv4 TCP segment.
 *
 * Validates the header length and checksum, fills in TCP_SKB_CB(), looks
 * up the owning socket and then either processes the segment right away
 * (tcp_v4_do_rcv()), leaves it on the prequeue, or adds it to the socket
 * backlog when the socket is currently owned by a user.  Segments without
 * a socket are answered with a reset unless they are too short or fail the
 * checksum.  TIME_WAIT sockets are handled under do_time_wait.
 *
 * Returns the tcp_v4_do_rcv() result when the segment was processed
 * directly, 0 in every other case.  All discard paths free the skb.
 */
1585 int tcp_v4_rcv(struct sk_buff *skb)
1586 {
1587 	const struct iphdr *iph;
1588 	const struct tcphdr *th;
1589 	struct sock *sk;
1590 	int ret;
1591 	struct net *net = dev_net(skb->dev);
1592 
1593 	if (skb->pkt_type != PACKET_HOST)
1594 		goto discard_it;
1595 
1596 	/* Count it even if it's bad */
1597 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1598 
1599 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1600 		goto discard_it;
1601 
1602 	th = tcp_hdr(skb);
1603 
1604 	if (th->doff < sizeof(struct tcphdr) / 4)
1605 		goto bad_packet;
1606 	if (!pskb_may_pull(skb, th->doff * 4))
1607 		goto discard_it;
1608 
1609 	/* An explanation is required here, I think.
1610 	 * Packet length and doff are validated by header prediction,
1611 	 * provided case of th->doff==0 is eliminated.
1612 	 * So, we defer the checks. */
1613 
1614 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1615 		goto csum_error;
1616 
	/* NOTE(review): th is re-read here, presumably because the second
	 * pskb_may_pull() above may have moved the header - confirm.
	 */
1617 	th = tcp_hdr(skb);
1618 	iph = ip_hdr(skb);
1619 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1620 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1621 	 */
1622 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1623 		sizeof(struct inet_skb_parm));
1624 	barrier();
1625 
	/* end_seq: SYN and FIN each take one sequence number on top of the
	 * payload length (skb->len minus the TCP header).
	 */
1626 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1627 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1628 				    skb->len - th->doff * 4);
1629 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1630 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1631 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1632 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1633 	TCP_SKB_CB(skb)->sacked	 = 0;
1634 
1635 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1636 	if (!sk)
1637 		goto no_tcp_socket;
1638 
	/* Also entered from do_time_wait, after sk has been replaced by a
	 * listening socket.
	 */
1639 process:
1640 	if (sk->sk_state == TCP_TIME_WAIT)
1641 		goto do_time_wait;
1642 
1643 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1644 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1645 		goto discard_and_relse;
1646 	}
1647 
1648 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1649 		goto discard_and_relse;
1650 
1651 #ifdef CONFIG_TCP_MD5SIG
1652 	/*
1653 	 * We really want to reject the packet as early as possible
1654 	 * if:
1655 	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1656 	 *  o There is an MD5 option and we're not expecting one
1657 	 */
1658 	if (tcp_v4_inbound_md5_hash(sk, skb))
1659 		goto discard_and_relse;
1660 #endif
1661 
1662 	nf_reset(skb);
1663 
1664 	if (sk_filter(sk, skb))
1665 		goto discard_and_relse;
1666 
1667 	sk_mark_napi_id(sk, skb);
1668 	skb->dev = NULL;
1669 
	/* Socket not owned by a user: offer the segment to the prequeue and
	 * process it right here if the prequeue declines.  Socket owned by a
	 * user: put it on the backlog, limited to sk_rcvbuf + sk_sndbuf.
	 */
1670 	bh_lock_sock_nested(sk);
1671 	ret = 0;
1672 	if (!sock_owned_by_user(sk)) {
1673 		if (!tcp_prequeue(sk, skb))
1674 			ret = tcp_v4_do_rcv(sk, skb);
1675 	} else if (unlikely(sk_add_backlog(sk, skb,
1676 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1677 		bh_unlock_sock(sk);
1678 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1679 		goto discard_and_relse;
1680 	}
1681 	bh_unlock_sock(sk);
1682 
1683 	sock_put(sk);
1684 
1685 	return ret;
1686 
1687 no_tcp_socket:
1688 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1689 		goto discard_it;
1690 
	/* csum_error and bad_packet are jumped to from elsewhere in this
	 * function; both only bump error counters, no reset is sent for them.
	 */
1691 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1692 csum_error:
1693 		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1694 bad_packet:
1695 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1696 	} else {
1697 		tcp_v4_send_reset(NULL, skb);
1698 	}
1699 
1700 discard_it:
1701 	/* Discard frame. */
1702 	kfree_skb(skb);
1703 	return 0;
1704 
1705 discard_and_relse:
1706 	sock_put(sk);
1707 	goto discard_it;
1708 
	/* sk is a timewait socket from here on; each early exit below drops
	 * the timewait reference before leaving.
	 */
1709 do_time_wait:
1710 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1711 		inet_twsk_put(inet_twsk(sk));
1712 		goto discard_it;
1713 	}
1714 
1715 	if (skb->len < (th->doff << 2)) {
1716 		inet_twsk_put(inet_twsk(sk));
1717 		goto bad_packet;
1718 	}
1719 	if (tcp_checksum_complete(skb)) {
1720 		inet_twsk_put(inet_twsk(sk));
1721 		goto csum_error;
1722 	}
1723 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1724 	case TCP_TW_SYN: {
		/* If a listener exists for this SYN, retire the timewait
		 * socket and restart processing on the listener.
		 */
1725 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1726 							&tcp_hashinfo,
1727 							iph->saddr, th->source,
1728 							iph->daddr, th->dest,
1729 							inet_iif(skb));
1730 		if (sk2) {
1731 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1732 			inet_twsk_put(inet_twsk(sk));
1733 			sk = sk2;
1734 			goto process;
1735 		}
1736 		/* Fall through to ACK */
1737 	}
1738 	case TCP_TW_ACK:
1739 		tcp_v4_timewait_ack(sk, skb);
1740 		break;
1741 	case TCP_TW_RST:
1742 		goto no_tcp_socket;
1743 	case TCP_TW_SUCCESS:;
1744 	}
1745 	goto discard_it;
1746 }
1747 
1748 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1749 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1750 	.twsk_unique	= tcp_twsk_unique,
1751 	.twsk_destructor= tcp_twsk_destructor,
1752 };
1753 
1754 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1755 {
1756 	struct dst_entry *dst = skb_dst(skb);
1757 
1758 	if (dst) {
1759 		dst_hold(dst);
1760 		sk->sk_rx_dst = dst;
1761 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1762 	}
1763 }
1764 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1765 
1766 const struct inet_connection_sock_af_ops ipv4_specific = {
1767 	.queue_xmit	   = ip_queue_xmit,
1768 	.send_check	   = tcp_v4_send_check,
1769 	.rebuild_header	   = inet_sk_rebuild_header,
1770 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1771 	.conn_request	   = tcp_v4_conn_request,
1772 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1773 	.net_header_len	   = sizeof(struct iphdr),
1774 	.setsockopt	   = ip_setsockopt,
1775 	.getsockopt	   = ip_getsockopt,
1776 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1777 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1778 	.bind_conflict	   = inet_csk_bind_conflict,
1779 #ifdef CONFIG_COMPAT
1780 	.compat_setsockopt = compat_ip_setsockopt,
1781 	.compat_getsockopt = compat_ip_getsockopt,
1782 #endif
1783 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1784 };
1785 EXPORT_SYMBOL(ipv4_specific);
1786 
1787 #ifdef CONFIG_TCP_MD5SIG
1788 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1789 	.md5_lookup		= tcp_v4_md5_lookup,
1790 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1791 	.md5_parse		= tcp_v4_parse_md5_keys,
1792 };
1793 #endif
1794 
1795 /* NOTE: A lot of things set to zero explicitly by call to
1796  *       sk_alloc() so need not be done here.
1797  */
1798 static int tcp_v4_init_sock(struct sock *sk)
1799 {
1800 	struct inet_connection_sock *icsk = inet_csk(sk);
1801 
1802 	tcp_init_sock(sk);
1803 
1804 	icsk->icsk_af_ops = &ipv4_specific;
1805 
1806 #ifdef CONFIG_TCP_MD5SIG
1807 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1808 #endif
1809 
1810 	return 0;
1811 }
1812 
1813 void tcp_v4_destroy_sock(struct sock *sk)
1814 {
1815 	struct tcp_sock *tp = tcp_sk(sk);
1816 
1817 	tcp_clear_xmit_timers(sk);
1818 
1819 	tcp_cleanup_congestion_control(sk);
1820 
1821 	/* Cleanup up the write buffer. */
1822 	tcp_write_queue_purge(sk);
1823 
1824 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1825 	__skb_queue_purge(&tp->out_of_order_queue);
1826 
1827 #ifdef CONFIG_TCP_MD5SIG
1828 	/* Clean up the MD5 key list, if any */
1829 	if (tp->md5sig_info) {
1830 		tcp_clear_md5_list(sk);
1831 		kfree_rcu(tp->md5sig_info, rcu);
1832 		tp->md5sig_info = NULL;
1833 	}
1834 #endif
1835 
1836 	/* Clean prequeue, it must be empty really */
1837 	__skb_queue_purge(&tp->ucopy.prequeue);
1838 
1839 	/* Clean up a referenced TCP bind bucket. */
1840 	if (inet_csk(sk)->icsk_bind_hash)
1841 		inet_put_port(sk);
1842 
1843 	BUG_ON(tp->fastopen_rsk != NULL);
1844 
1845 	/* If socket is aborted during connect operation */
1846 	tcp_free_fastopen_req(tp);
1847 
1848 	sk_sockets_allocated_dec(sk);
1849 	sock_release_memcg(sk);
1850 }
1851 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1852 
1853 #ifdef CONFIG_PROC_FS
1854 /* Proc filesystem TCP sock list dumping. */
1855 
1856 /*
1857  * Get next listener socket follow cur.  If cur is NULL, get first socket
1858  * starting from bucket given in st->bucket; when st->bucket is zero the
1859  * very first socket in the hash table is returned.
1860  */
/*
 * Locking as visible in this function:
 *  - a listener is returned with its hash bucket lock (ilb->lock) held;
 *  - a request sock is returned with, in addition, the syn_wait_lock of
 *    st->syn_wait_sk read-locked;
 *  - NULL is returned with no lock held.
 * When @cur is non-NULL the function does not take the bucket lock itself,
 * so it presumably relies on the state left by the previous call - confirm
 * against the callers.
 */
1861 static void *listening_get_next(struct seq_file *seq, void *cur)
1862 {
1863 	struct inet_connection_sock *icsk;
1864 	struct hlist_nulls_node *node;
1865 	struct sock *sk = cur;
1866 	struct inet_listen_hashbucket *ilb;
1867 	struct tcp_iter_state *st = seq->private;
1868 	struct net *net = seq_file_net(seq);
1869 
	/* Fresh start: lock bucket st->bucket and scan it from its head. */
1870 	if (!sk) {
1871 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1872 		spin_lock_bh(&ilb->lock);
1873 		sk = sk_nulls_head(&ilb->head);
1874 		st->offset = 0;
1875 		goto get_sk;
1876 	}
1877 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1878 	++st->num;
1879 	++st->offset;
1880 
1881 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		/* cur is a request sock of st->syn_wait_sk: continue along its
		 * chain, then through the remaining syn_table slots.
		 */
1882 		struct request_sock *req = cur;
1883 
1884 		icsk = inet_csk(st->syn_wait_sk);
1885 		req = req->dl_next;
1886 		while (1) {
1887 			while (req) {
1888 				if (req->rsk_ops->family == st->family) {
1889 					cur = req;
1890 					goto out;
1891 				}
1892 				req = req->dl_next;
1893 			}
1894 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1895 				break;
			/* Also the entry point from start_req below. */
1896 get_req:
1897 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1898 		}
		/* SYN table exhausted: resume walking listeners after
		 * st->syn_wait_sk and drop its syn_wait_lock.
		 */
1899 		sk	  = sk_nulls_next(st->syn_wait_sk);
1900 		st->state = TCP_SEQ_STATE_LISTENING;
1901 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1902 	} else {
		/* cur is a listener: visit its pending requests (if any)
		 * before moving on to the next listener.
		 */
1903 		icsk = inet_csk(sk);
1904 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1905 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1906 			goto start_req;
1907 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1908 		sk = sk_nulls_next(sk);
1909 	}
1910 get_sk:
1911 	sk_nulls_for_each_from(sk, node) {
1912 		if (!net_eq(sock_net(sk), net))
1913 			continue;
1914 		if (sk->sk_family == st->family) {
1915 			cur = sk;
1916 			goto out;
1917 		}
		/* Listener of another family: its request socks are still
		 * scanned for entries of st->family.
		 */
1918 		icsk = inet_csk(sk);
1919 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1920 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1921 start_req:
1922 			st->uid		= sock_i_uid(sk);
1923 			st->syn_wait_sk = sk;
1924 			st->state	= TCP_SEQ_STATE_OPENREQ;
1925 			st->sbucket	= 0;
1926 			goto get_req;
1927 		}
1928 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1929 	}
	/* Bucket exhausted: release its lock and move on to the next one. */
1930 	spin_unlock_bh(&ilb->lock);
1931 	st->offset = 0;
1932 	if (++st->bucket < INET_LHTABLE_SIZE) {
1933 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1934 		spin_lock_bh(&ilb->lock);
1935 		sk = sk_nulls_head(&ilb->head);
1936 		goto get_sk;
1937 	}
1938 	cur = NULL;
1939 out:
1940 	return cur;
1941 }
1942 
1943 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1944 {
1945 	struct tcp_iter_state *st = seq->private;
1946 	void *rc;
1947 
1948 	st->bucket = 0;
1949 	st->offset = 0;
1950 	rc = listening_get_next(seq, NULL);
1951 
1952 	while (rc && *pos) {
1953 		rc = listening_get_next(seq, rc);
1954 		--*pos;
1955 	}
1956 	return rc;
1957 }
1958 
1959 static inline bool empty_bucket(const struct tcp_iter_state *st)
1960 {
1961 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1962 }
1963 
1964 /*
1965  * Get first established socket starting from bucket given in st->bucket.
1966  * If st->bucket is zero, the very first socket in the hash is returned.
1967  */
1968 static void *established_get_first(struct seq_file *seq)
1969 {
1970 	struct tcp_iter_state *st = seq->private;
1971 	struct net *net = seq_file_net(seq);
1972 	void *rc = NULL;
1973 
1974 	st->offset = 0;
1975 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1976 		struct sock *sk;
1977 		struct hlist_nulls_node *node;
1978 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1979 
1980 		/* Lockless fast path for the common case of empty buckets */
1981 		if (empty_bucket(st))
1982 			continue;
1983 
1984 		spin_lock_bh(lock);
1985 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1986 			if (sk->sk_family != st->family ||
1987 			    !net_eq(sock_net(sk), net)) {
1988 				continue;
1989 			}
1990 			rc = sk;
1991 			goto out;
1992 		}
1993 		spin_unlock_bh(lock);
1994 	}
1995 out:
1996 	return rc;
1997 }
1998 
1999 static void *established_get_next(struct seq_file *seq, void *cur)
2000 {
2001 	struct sock *sk = cur;
2002 	struct hlist_nulls_node *node;
2003 	struct tcp_iter_state *st = seq->private;
2004 	struct net *net = seq_file_net(seq);
2005 
2006 	++st->num;
2007 	++st->offset;
2008 
2009 	sk = sk_nulls_next(sk);
2010 
2011 	sk_nulls_for_each_from(sk, node) {
2012 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2013 			return sk;
2014 	}
2015 
2016 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2017 	++st->bucket;
2018 	return established_get_first(seq);
2019 }
2020 
2021 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2022 {
2023 	struct tcp_iter_state *st = seq->private;
2024 	void *rc;
2025 
2026 	st->bucket = 0;
2027 	rc = established_get_first(seq);
2028 
2029 	while (rc && pos) {
2030 		rc = established_get_next(seq, rc);
2031 		--pos;
2032 	}
2033 	return rc;
2034 }
2035 
2036 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2037 {
2038 	void *rc;
2039 	struct tcp_iter_state *st = seq->private;
2040 
2041 	st->state = TCP_SEQ_STATE_LISTENING;
2042 	rc	  = listening_get_idx(seq, &pos);
2043 
2044 	if (!rc) {
2045 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2046 		rc	  = established_get_idx(seq, pos);
2047 	}
2048 
2049 	return rc;
2050 }
2051 
2052 static void *tcp_seek_last_pos(struct seq_file *seq)
2053 {
2054 	struct tcp_iter_state *st = seq->private;
2055 	int offset = st->offset;
2056 	int orig_num = st->num;
2057 	void *rc = NULL;
2058 
2059 	switch (st->state) {
2060 	case TCP_SEQ_STATE_OPENREQ:
2061 	case TCP_SEQ_STATE_LISTENING:
2062 		if (st->bucket >= INET_LHTABLE_SIZE)
2063 			break;
2064 		st->state = TCP_SEQ_STATE_LISTENING;
2065 		rc = listening_get_next(seq, NULL);
2066 		while (offset-- && rc)
2067 			rc = listening_get_next(seq, rc);
2068 		if (rc)
2069 			break;
2070 		st->bucket = 0;
2071 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2072 		/* Fallthrough */
2073 	case TCP_SEQ_STATE_ESTABLISHED:
2074 		if (st->bucket > tcp_hashinfo.ehash_mask)
2075 			break;
2076 		rc = established_get_first(seq);
2077 		while (offset-- && rc)
2078 			rc = established_get_next(seq, rc);
2079 	}
2080 
2081 	st->num = orig_num;
2082 
2083 	return rc;
2084 }
2085 
2086 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2087 {
2088 	struct tcp_iter_state *st = seq->private;
2089 	void *rc;
2090 
2091 	if (*pos && *pos == st->last_pos) {
2092 		rc = tcp_seek_last_pos(seq);
2093 		if (rc)
2094 			goto out;
2095 	}
2096 
2097 	st->state = TCP_SEQ_STATE_LISTENING;
2098 	st->num = 0;
2099 	st->bucket = 0;
2100 	st->offset = 0;
2101 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2102 
2103 out:
2104 	st->last_pos = *pos;
2105 	return rc;
2106 }
2107 
2108 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2109 {
2110 	struct tcp_iter_state *st = seq->private;
2111 	void *rc = NULL;
2112 
2113 	if (v == SEQ_START_TOKEN) {
2114 		rc = tcp_get_idx(seq, 0);
2115 		goto out;
2116 	}
2117 
2118 	switch (st->state) {
2119 	case TCP_SEQ_STATE_OPENREQ:
2120 	case TCP_SEQ_STATE_LISTENING:
2121 		rc = listening_get_next(seq, v);
2122 		if (!rc) {
2123 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2124 			st->bucket = 0;
2125 			st->offset = 0;
2126 			rc	  = established_get_first(seq);
2127 		}
2128 		break;
2129 	case TCP_SEQ_STATE_ESTABLISHED:
2130 		rc = established_get_next(seq, v);
2131 		break;
2132 	}
2133 out:
2134 	++*pos;
2135 	st->last_pos = *pos;
2136 	return rc;
2137 }
2138 
/*
 * seq_file ->stop handler (installed by tcp_proc_register()).
 *
 * Drops the lock(s) matching the iterator's current state:
 *  - OPENREQ:     the syn_wait_lock of st->syn_wait_sk (only when @v is
 *                 non-NULL), then the same unlock as LISTENING;
 *  - LISTENING:   the listening_hash[st->bucket] lock, unless @v is the
 *                 start token;
 *  - ESTABLISHED: the ehash lock for st->bucket, only when @v is non-NULL.
 */
2139 static void tcp_seq_stop(struct seq_file *seq, void *v)
2140 {
2141 	struct tcp_iter_state *st = seq->private;
2142 
2143 	switch (st->state) {
2144 	case TCP_SEQ_STATE_OPENREQ:
2145 		if (v) {
2146 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2147 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2148 		}
		/* No break: execution continues into the LISTENING case. */
2149 	case TCP_SEQ_STATE_LISTENING:
		/*
		 * Note the test is against SEQ_START_TOKEN, not NULL, so a
		 * NULL @v also reaches the unlock.
		 */
2150 		if (v != SEQ_START_TOKEN)
2151 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2152 		break;
2153 	case TCP_SEQ_STATE_ESTABLISHED:
2154 		if (v)
2155 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2156 		break;
2157 	}
2158 }
2159 
2160 int tcp_seq_open(struct inode *inode, struct file *file)
2161 {
2162 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2163 	struct tcp_iter_state *s;
2164 	int err;
2165 
2166 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2167 			  sizeof(struct tcp_iter_state));
2168 	if (err < 0)
2169 		return err;
2170 
2171 	s = ((struct seq_file *)file->private_data)->private;
2172 	s->family		= afinfo->family;
2173 	s->last_pos		= 0;
2174 	return 0;
2175 }
2176 EXPORT_SYMBOL(tcp_seq_open);
2177 
2178 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2179 {
2180 	int rc = 0;
2181 	struct proc_dir_entry *p;
2182 
2183 	afinfo->seq_ops.start		= tcp_seq_start;
2184 	afinfo->seq_ops.next		= tcp_seq_next;
2185 	afinfo->seq_ops.stop		= tcp_seq_stop;
2186 
2187 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2188 			     afinfo->seq_fops, afinfo);
2189 	if (!p)
2190 		rc = -ENOMEM;
2191 	return rc;
2192 }
2193 EXPORT_SYMBOL(tcp_proc_register);
2194 
2195 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2196 {
2197 	remove_proc_entry(afinfo->name, net->proc_net);
2198 }
2199 EXPORT_SYMBOL(tcp_proc_unregister);
2200 
2201 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2202 			 struct seq_file *f, int i, kuid_t uid)
2203 {
2204 	const struct inet_request_sock *ireq = inet_rsk(req);
2205 	long delta = req->expires - jiffies;
2206 
2207 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2208 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2209 		i,
2210 		ireq->ir_loc_addr,
2211 		ntohs(inet_sk(sk)->inet_sport),
2212 		ireq->ir_rmt_addr,
2213 		ntohs(ireq->ir_rmt_port),
2214 		TCP_SYN_RECV,
2215 		0, 0, /* could print option size, but that is af dependent. */
2216 		1,    /* timers active (only the expire timer) */
2217 		jiffies_delta_to_clock_t(delta),
2218 		req->num_timeout,
2219 		from_kuid_munged(seq_user_ns(f), uid),
2220 		0,  /* non standard timer */
2221 		0, /* open_requests have no inode */
2222 		atomic_read(&sk->sk_refcnt),
2223 		req);
2224 }
2225 
2226 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2227 {
2228 	int timer_active;
2229 	unsigned long timer_expires;
2230 	const struct tcp_sock *tp = tcp_sk(sk);
2231 	const struct inet_connection_sock *icsk = inet_csk(sk);
2232 	const struct inet_sock *inet = inet_sk(sk);
2233 	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2234 	__be32 dest = inet->inet_daddr;
2235 	__be32 src = inet->inet_rcv_saddr;
2236 	__u16 destp = ntohs(inet->inet_dport);
2237 	__u16 srcp = ntohs(inet->inet_sport);
2238 	int rx_queue;
2239 
2240 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2241 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2242 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2243 		timer_active	= 1;
2244 		timer_expires	= icsk->icsk_timeout;
2245 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2246 		timer_active	= 4;
2247 		timer_expires	= icsk->icsk_timeout;
2248 	} else if (timer_pending(&sk->sk_timer)) {
2249 		timer_active	= 2;
2250 		timer_expires	= sk->sk_timer.expires;
2251 	} else {
2252 		timer_active	= 0;
2253 		timer_expires = jiffies;
2254 	}
2255 
2256 	if (sk->sk_state == TCP_LISTEN)
2257 		rx_queue = sk->sk_ack_backlog;
2258 	else
2259 		/*
2260 		 * because we dont lock socket, we might find a transient negative value
2261 		 */
2262 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2263 
2264 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2265 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2266 		i, src, srcp, dest, destp, sk->sk_state,
2267 		tp->write_seq - tp->snd_una,
2268 		rx_queue,
2269 		timer_active,
2270 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2271 		icsk->icsk_retransmits,
2272 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2273 		icsk->icsk_probes_out,
2274 		sock_i_ino(sk),
2275 		atomic_read(&sk->sk_refcnt), sk,
2276 		jiffies_to_clock_t(icsk->icsk_rto),
2277 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2278 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2279 		tp->snd_cwnd,
2280 		sk->sk_state == TCP_LISTEN ?
2281 		    (fastopenq ? fastopenq->max_qlen : 0) :
2282 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2283 }
2284 
2285 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2286 			       struct seq_file *f, int i)
2287 {
2288 	__be32 dest, src;
2289 	__u16 destp, srcp;
2290 	s32 delta = tw->tw_ttd - inet_tw_time_stamp();
2291 
2292 	dest  = tw->tw_daddr;
2293 	src   = tw->tw_rcv_saddr;
2294 	destp = ntohs(tw->tw_dport);
2295 	srcp  = ntohs(tw->tw_sport);
2296 
2297 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2298 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2299 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2300 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2301 		atomic_read(&tw->tw_refcnt), tw);
2302 }
2303 
2304 #define TMPSZ 150
2305 
2306 static int tcp4_seq_show(struct seq_file *seq, void *v)
2307 {
2308 	struct tcp_iter_state *st;
2309 	struct sock *sk = v;
2310 
2311 	seq_setwidth(seq, TMPSZ - 1);
2312 	if (v == SEQ_START_TOKEN) {
2313 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2314 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2315 			   "inode");
2316 		goto out;
2317 	}
2318 	st = seq->private;
2319 
2320 	switch (st->state) {
2321 	case TCP_SEQ_STATE_LISTENING:
2322 	case TCP_SEQ_STATE_ESTABLISHED:
2323 		if (sk->sk_state == TCP_TIME_WAIT)
2324 			get_timewait4_sock(v, seq, st->num);
2325 		else
2326 			get_tcp4_sock(v, seq, st->num);
2327 		break;
2328 	case TCP_SEQ_STATE_OPENREQ:
2329 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
2330 		break;
2331 	}
2332 out:
2333 	seq_pad(seq, '\n');
2334 	return 0;
2335 }
2336 
2337 static const struct file_operations tcp_afinfo_seq_fops = {
2338 	.owner   = THIS_MODULE,
2339 	.open    = tcp_seq_open,
2340 	.read    = seq_read,
2341 	.llseek  = seq_lseek,
2342 	.release = seq_release_net
2343 };
2344 
2345 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2346 	.name		= "tcp",
2347 	.family		= AF_INET,
2348 	.seq_fops	= &tcp_afinfo_seq_fops,
2349 	.seq_ops	= {
2350 		.show		= tcp4_seq_show,
2351 	},
2352 };
2353 
2354 static int __net_init tcp4_proc_init_net(struct net *net)
2355 {
2356 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2357 }
2358 
2359 static void __net_exit tcp4_proc_exit_net(struct net *net)
2360 {
2361 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2362 }
2363 
2364 static struct pernet_operations tcp4_net_ops = {
2365 	.init = tcp4_proc_init_net,
2366 	.exit = tcp4_proc_exit_net,
2367 };
2368 
2369 int __init tcp4_proc_init(void)
2370 {
2371 	return register_pernet_subsys(&tcp4_net_ops);
2372 }
2373 
2374 void tcp4_proc_exit(void)
2375 {
2376 	unregister_pernet_subsys(&tcp4_net_ops);
2377 }
2378 #endif /* CONFIG_PROC_FS */
2379 
2380 struct proto tcp_prot = {
2381 	.name			= "TCP",
2382 	.owner			= THIS_MODULE,
2383 	.close			= tcp_close,
2384 	.connect		= tcp_v4_connect,
2385 	.disconnect		= tcp_disconnect,
2386 	.accept			= inet_csk_accept,
2387 	.ioctl			= tcp_ioctl,
2388 	.init			= tcp_v4_init_sock,
2389 	.destroy		= tcp_v4_destroy_sock,
2390 	.shutdown		= tcp_shutdown,
2391 	.setsockopt		= tcp_setsockopt,
2392 	.getsockopt		= tcp_getsockopt,
2393 	.recvmsg		= tcp_recvmsg,
2394 	.sendmsg		= tcp_sendmsg,
2395 	.sendpage		= tcp_sendpage,
2396 	.backlog_rcv		= tcp_v4_do_rcv,
2397 	.release_cb		= tcp_release_cb,
2398 	.hash			= inet_hash,
2399 	.unhash			= inet_unhash,
2400 	.get_port		= inet_csk_get_port,
2401 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2402 	.stream_memory_free	= tcp_stream_memory_free,
2403 	.sockets_allocated	= &tcp_sockets_allocated,
2404 	.orphan_count		= &tcp_orphan_count,
2405 	.memory_allocated	= &tcp_memory_allocated,
2406 	.memory_pressure	= &tcp_memory_pressure,
2407 	.sysctl_mem		= sysctl_tcp_mem,
2408 	.sysctl_wmem		= sysctl_tcp_wmem,
2409 	.sysctl_rmem		= sysctl_tcp_rmem,
2410 	.max_header		= MAX_TCP_HEADER,
2411 	.obj_size		= sizeof(struct tcp_sock),
2412 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2413 	.twsk_prot		= &tcp_timewait_sock_ops,
2414 	.rsk_prot		= &tcp_request_sock_ops,
2415 	.h.hashinfo		= &tcp_hashinfo,
2416 	.no_autobind		= true,
2417 #ifdef CONFIG_COMPAT
2418 	.compat_setsockopt	= compat_tcp_setsockopt,
2419 	.compat_getsockopt	= compat_tcp_getsockopt,
2420 #endif
2421 #ifdef CONFIG_MEMCG_KMEM
2422 	.init_cgroup		= tcp_init_cgroup,
2423 	.destroy_cgroup		= tcp_destroy_cgroup,
2424 	.proto_cgroup		= tcp_proto_cgroup,
2425 #endif
2426 };
2427 EXPORT_SYMBOL(tcp_prot);
2428 
2429 static int __net_init tcp_sk_init(struct net *net)
2430 {
2431 	net->ipv4.sysctl_tcp_ecn = 2;
2432 	return 0;
2433 }
2434 
/*
 * Per-namespace exit hook referenced by tcp_sk_ops.  The body is empty:
 * nothing is done per namespace here (tcp_sk_exit_batch() does the purge).
 */
2435 static void __net_exit tcp_sk_exit(struct net *net)
2436 {
2437 }
2438 
2439 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2440 {
2441 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2442 }
2443 
2444 static struct pernet_operations __net_initdata tcp_sk_ops = {
2445        .init	   = tcp_sk_init,
2446        .exit	   = tcp_sk_exit,
2447        .exit_batch = tcp_sk_exit_batch,
2448 };
2449 
2450 void __init tcp_v4_init(void)
2451 {
2452 	inet_hashinfo_init(&tcp_hashinfo);
2453 	if (register_pernet_subsys(&tcp_sk_ops))
2454 		panic("Failed to create the TCP control socket.\n");
2455 }
2456