xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 8684014d)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
78 
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84 
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87 
/* Consulted by tcp_twsk_unique(): when non-zero, a TIME-WAIT bucket whose
 * last timestamp stamp is more than one second old may be taken over by a
 * new connection.
 */
int sysctl_tcp_tw_reuse __read_mostly;
/* NOTE(review): not read anywhere in the part of this file visible here;
 * it is exported, so it is presumably consumed by other TCP code - confirm.
 */
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: the RST and ACK reply builders below call this
 * before its definition appears.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

/* Hash tables handed to the socket lookups in this file (inet_lookup() in
 * tcp_v4_err(), __inet_lookup_listener() in tcp_v4_send_reset()).
 */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
99 
100 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103 					  ip_hdr(skb)->saddr,
104 					  tcp_hdr(skb)->dest,
105 					  tcp_hdr(skb)->source);
106 }
107 
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 
113 	/* With PAWS, it is safe from the viewpoint
114 	   of data integrity. Even without PAWS it is safe provided sequence
115 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
116 
117 	   Actually, the idea is close to VJ's one, only timestamp cache is
118 	   held not per host, but per port pair and TW bucket is used as state
119 	   holder.
120 
121 	   If TW bucket has been already destroyed we fall back to VJ's scheme
122 	   and use initial timestamp retrieved from peer table.
123 	 */
124 	if (tcptw->tw_ts_recent_stamp &&
125 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
126 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128 		if (tp->write_seq == 0)
129 			tp->write_seq = 1;
130 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
131 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132 		sock_hold(sktw);
133 		return 1;
134 	}
135 
136 	return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
139 
/*
 * This will initiate an outgoing connection.
 *
 * @sk:       socket to connect
 * @uaddr:    destination address, interpreted as a struct sockaddr_in
 * @addr_len: length of @uaddr as given by the caller
 *
 * Resolves a route to the destination (or to the first hop when the socket
 * carries a source-route IP option), records addresses and ports in the
 * socket, switches it to SYN-SENT, hashes it with inet_hash_connect() and
 * finally calls tcp_connect().
 *
 * Returns 0 on success.  Returns -EINVAL for a too-short address or a zero
 * destination combined with source routing, -EAFNOSUPPORT for a family
 * other than AF_INET, -ENETUNREACH for a multicast/broadcast route, or the
 * error reported by routing, inet_hash_connect() or tcp_connect().  For
 * failures after the state change the socket is returned to TCP_CLOSE and
 * inet_dport is cleared.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	/* With a source route the packet is routed towards the option's
	 * first hop (faddr) while daddr stays the user-supplied address.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	/* Remember the ports used for this lookup so that
	 * ip_route_newports() below can be told what changed.
	 */
	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* Refuse multicast and broadcast destinations. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	/* Without source routing, take the destination from the flow. */
	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	/* An unset source address is filled in from the flow. */
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	/* Account for the length of IP options, if any. */
	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	inet_set_txhash(sk);

	/* Pass the original and the now-current ports to the routing code. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* rt holds an error pointer; clear it before the shared
		 * failure path hands it to ip_rt_put().
		 */
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	/* Pick an initial sequence number unless one is already set or the
	 * socket is in repair mode.
	 */
	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	/* NOTE(review): rt was handed to sk_setup_caps() above, so the
	 * socket presumably owns the route now; clearing rt keeps the
	 * failure path from passing it to ip_rt_put() - confirm.
	 */
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
266 
267 /*
268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269  * It can be called through tcp_release_cb() if socket was owned by user
270  * at the time tcp_v4_err() was called to handle ICMP message.
271  */
272 void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274 	struct dst_entry *dst;
275 	struct inet_sock *inet = inet_sk(sk);
276 	u32 mtu = tcp_sk(sk)->mtu_info;
277 
278 	dst = inet_csk_update_pmtu(sk, mtu);
279 	if (!dst)
280 		return;
281 
282 	/* Something is about to be wrong... Remember soft error
283 	 * for the case, if this connection will not able to recover.
284 	 */
285 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286 		sk->sk_err_soft = EMSGSIZE;
287 
288 	mtu = dst_mtu(dst);
289 
290 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291 	    ip_sk_accept_pmtu(sk) &&
292 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293 		tcp_sync_mss(sk, mtu);
294 
295 		/* Resend the TCP packet because it's
296 		 * clear that the old packet has been
297 		 * dropped. This is the new "fast" path mtu
298 		 * discovery.
299 		 */
300 		tcp_simple_retransmit(sk);
301 	} /* else let the usual retransmit timer handle it */
302 }
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
304 
305 static void do_redirect(struct sk_buff *skb, struct sock *sk)
306 {
307 	struct dst_entry *dst = __sk_dst_check(sk, 0);
308 
309 	if (dst)
310 		dst->ops->redirect(dst, sk, skb);
311 }
312 
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * NOTE(review): the "err" argument described above is not part of the
 * signature below (the function takes @icmp_skb and @info); that paragraph
 * presumably predates the current interface - confirm before relying on it.
 *
 * @icmp_skb: the ICMP message; its data is read as an IP header followed
 *	      by a TCP header (presumably those of the packet that triggered
 *	      the error), which are used to look up the socket.
 * @info:     stored in tp->mtu_info for ICMP_FRAG_NEEDED; not used for any
 *	      other message type here.
 *
 * Outline: look up the socket, ignore TIME_WAIT and CLOSE sockets, apply
 * the min_ttl and sequence-window checks, then dispatch on the ICMP type
 * (redirect, PMTU, RTO backoff revert) and finally on the socket state to
 * decide whether the error becomes sk_err or only sk_err_soft.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	/* TIME_WAIT entries are released with inet_twsk_put() instead of
	 * sock_put(), and are otherwise ignored.
	 */
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Except for listeners, the quoted sequence number must lie between
	 * snd_una and snd_nxt.
	 */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer the MTU handling.  The extra
				 * reference is taken only when the deferred
				 * flag was not already set.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		/* Undo one backoff step and recompute the RTO. */
		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		/* Part of the new RTO not yet elapsed since the head of the
		 * write queue was stamped; 0 when it has already expired.
		 */
		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && fastopen->sk == NULL)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	/* Drop the lock taken above and the reference held since the lookup. */
	bh_unlock_sock(sk);
	sock_put(sk);
}
539 
540 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
541 {
542 	struct tcphdr *th = tcp_hdr(skb);
543 
544 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
545 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
546 		skb->csum_start = skb_transport_header(skb) - skb->head;
547 		skb->csum_offset = offsetof(struct tcphdr, check);
548 	} else {
549 		th->check = tcp_v4_check(skb->len, saddr, daddr,
550 					 csum_partial(th,
551 						      th->doff << 2,
552 						      skb->csum));
553 	}
554 }
555 
556 /* This routine computes an IPv4 TCP checksum. */
557 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
558 {
559 	const struct inet_sock *inet = inet_sk(sk);
560 
561 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
562 }
563 EXPORT_SYMBOL(tcp_v4_send_check);
564 
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 *
 *	@sk:  socket the segment was matched to, or NULL when there is none
 *	@skb: the received segment being answered
 *
 *	Nothing is sent when @skb itself carries RST, when there is no
 *	socket and the input route is not RTN_LOCAL, or (CONFIG_TCP_MD5SIG,
 *	no socket, MD5 option present) when no listener/key is found or the
 *	signature does not verify.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply image: TCP header plus room for one aligned MD5 option. */
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	/* If the segment carried an ACK, the RST takes its sequence number
	 * from that ACK; otherwise the RST acknowledges everything the
	 * segment occupied (payload length plus SYN and FIN).
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net,
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		/* Taken only on this path; released at release_sk1, which
		 * tests sk1 for exactly that reason.
		 */
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Pseudo-header part of the checksum, with the received segment's
	 * addresses swapped.  NOTE(review): the sum over the reply bytes is
	 * presumably completed by ip_send_unicast_reply() - confirm.
	 */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	/* Undo the RCU read lock and the listener reference, if taken. */
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}
701 
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

/*
 * Build and send a bare ACK in reply to @skb without using a full socket.
 *
 * @skb:         received segment; its ports and addresses are swapped
 * @seq, @ack:   sequence and acknowledgment numbers (host byte order)
 * @win:         window value, stored with htons()
 * @tsval:       timestamp value to send
 * @tsecr:       timestamp echo; a timestamp option is added only if non-zero
 * @oif:         if non-zero, used as arg.bound_dev_if
 * @key:         MD5 key, or NULL; only looked at under CONFIG_TCP_MD5SIG
 * @reply_flags: copied to arg.flags
 * @tos:         copied to arg.tos
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply image: TCP header, then room for an aligned timestamp
	 * option and, with CONFIG_TCP_MD5SIG, an aligned MD5 option.
	 */
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	/* Only the header is cleared; option words are written below and
	 * iov_len grows only for the options actually added.
	 */
	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* The MD5 option follows the three timestamp words, if any. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
776 
777 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
778 {
779 	struct inet_timewait_sock *tw = inet_twsk(sk);
780 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
781 
782 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
783 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
784 			tcp_time_stamp + tcptw->tw_ts_offset,
785 			tcptw->tw_ts_recent,
786 			tw->tw_bound_dev_if,
787 			tcp_twsk_md5_key(tcptw),
788 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
789 			tw->tw_tos
790 			);
791 
792 	inet_twsk_put(tw);
793 }
794 
795 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
796 				  struct request_sock *req)
797 {
798 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
799 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
800 	 */
801 	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
802 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
803 			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
804 			tcp_time_stamp,
805 			req->ts_recent,
806 			0,
807 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
808 					  AF_INET),
809 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
810 			ip_hdr(skb)->tos);
811 }
812 
813 /*
814  *	Send a SYN-ACK after having received a SYN.
815  *	This still operates on a request_sock only, not on a big
816  *	socket.
817  */
818 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
819 			      struct flowi *fl,
820 			      struct request_sock *req,
821 			      u16 queue_mapping,
822 			      struct tcp_fastopen_cookie *foc)
823 {
824 	const struct inet_request_sock *ireq = inet_rsk(req);
825 	struct flowi4 fl4;
826 	int err = -1;
827 	struct sk_buff *skb;
828 
829 	/* First, grab a route. */
830 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
831 		return -1;
832 
833 	skb = tcp_make_synack(sk, dst, req, foc);
834 
835 	if (skb) {
836 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
837 
838 		skb_set_queue_mapping(skb, queue_mapping);
839 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
840 					    ireq->ir_rmt_addr,
841 					    ireq->opt);
842 		err = net_xmit_eval(err);
843 	}
844 
845 	return err;
846 }
847 
848 /*
849  *	IPv4 request_sock destructor.
850  */
851 static void tcp_v4_reqsk_destructor(struct request_sock *req)
852 {
853 	kfree(inet_rsk(req)->opt);
854 }
855 
856 /*
857  * Return true if a syncookie should be sent
858  */
859 bool tcp_syn_flood_action(struct sock *sk,
860 			 const struct sk_buff *skb,
861 			 const char *proto)
862 {
863 	const char *msg = "Dropping request";
864 	bool want_cookie = false;
865 	struct listen_sock *lopt;
866 
867 #ifdef CONFIG_SYN_COOKIES
868 	if (sysctl_tcp_syncookies) {
869 		msg = "Sending cookies";
870 		want_cookie = true;
871 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
872 	} else
873 #endif
874 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
875 
876 	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
877 	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
878 		lopt->synflood_warned = 1;
879 		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
880 			proto, ntohs(tcp_hdr(skb)->dest), msg);
881 	}
882 	return want_cookie;
883 }
884 EXPORT_SYMBOL(tcp_syn_flood_action);
885 
886 #ifdef CONFIG_TCP_MD5SIG
887 /*
888  * RFC2385 MD5 checksumming requires a mapping of
889  * IP address->MD5 Key.
890  * We need to maintain these in the sk structure.
891  */
892 
893 /* Find the Key structure for an address.  */
894 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
895 					 const union tcp_md5_addr *addr,
896 					 int family)
897 {
898 	struct tcp_sock *tp = tcp_sk(sk);
899 	struct tcp_md5sig_key *key;
900 	unsigned int size = sizeof(struct in_addr);
901 	struct tcp_md5sig_info *md5sig;
902 
903 	/* caller either holds rcu_read_lock() or socket lock */
904 	md5sig = rcu_dereference_check(tp->md5sig_info,
905 				       sock_owned_by_user(sk) ||
906 				       lockdep_is_held(&sk->sk_lock.slock));
907 	if (!md5sig)
908 		return NULL;
909 #if IS_ENABLED(CONFIG_IPV6)
910 	if (family == AF_INET6)
911 		size = sizeof(struct in6_addr);
912 #endif
913 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
914 		if (key->family != family)
915 			continue;
916 		if (!memcmp(&key->addr, addr, size))
917 			return key;
918 	}
919 	return NULL;
920 }
921 EXPORT_SYMBOL(tcp_md5_do_lookup);
922 
923 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
924 					 struct sock *addr_sk)
925 {
926 	union tcp_md5_addr *addr;
927 
928 	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
929 	return tcp_md5_do_lookup(sk, addr, AF_INET);
930 }
931 EXPORT_SYMBOL(tcp_v4_md5_lookup);
932 
933 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
934 						      struct request_sock *req)
935 {
936 	union tcp_md5_addr *addr;
937 
938 	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
939 	return tcp_md5_do_lookup(sk, addr, AF_INET);
940 }
941 
942 /* This can be called on a newly created socket, from other files */
943 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
944 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
945 {
946 	/* Add Key to the list */
947 	struct tcp_md5sig_key *key;
948 	struct tcp_sock *tp = tcp_sk(sk);
949 	struct tcp_md5sig_info *md5sig;
950 
951 	key = tcp_md5_do_lookup(sk, addr, family);
952 	if (key) {
953 		/* Pre-existing entry - just update that one. */
954 		memcpy(key->key, newkey, newkeylen);
955 		key->keylen = newkeylen;
956 		return 0;
957 	}
958 
959 	md5sig = rcu_dereference_protected(tp->md5sig_info,
960 					   sock_owned_by_user(sk));
961 	if (!md5sig) {
962 		md5sig = kmalloc(sizeof(*md5sig), gfp);
963 		if (!md5sig)
964 			return -ENOMEM;
965 
966 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
967 		INIT_HLIST_HEAD(&md5sig->head);
968 		rcu_assign_pointer(tp->md5sig_info, md5sig);
969 	}
970 
971 	key = sock_kmalloc(sk, sizeof(*key), gfp);
972 	if (!key)
973 		return -ENOMEM;
974 	if (!tcp_alloc_md5sig_pool()) {
975 		sock_kfree_s(sk, key, sizeof(*key));
976 		return -ENOMEM;
977 	}
978 
979 	memcpy(key->key, newkey, newkeylen);
980 	key->keylen = newkeylen;
981 	key->family = family;
982 	memcpy(&key->addr, addr,
983 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
984 				      sizeof(struct in_addr));
985 	hlist_add_head_rcu(&key->node, &md5sig->head);
986 	return 0;
987 }
988 EXPORT_SYMBOL(tcp_md5_do_add);
989 
990 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
991 {
992 	struct tcp_md5sig_key *key;
993 
994 	key = tcp_md5_do_lookup(sk, addr, family);
995 	if (!key)
996 		return -ENOENT;
997 	hlist_del_rcu(&key->node);
998 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
999 	kfree_rcu(key, rcu);
1000 	return 0;
1001 }
1002 EXPORT_SYMBOL(tcp_md5_do_del);
1003 
1004 static void tcp_clear_md5_list(struct sock *sk)
1005 {
1006 	struct tcp_sock *tp = tcp_sk(sk);
1007 	struct tcp_md5sig_key *key;
1008 	struct hlist_node *n;
1009 	struct tcp_md5sig_info *md5sig;
1010 
1011 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1012 
1013 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1014 		hlist_del_rcu(&key->node);
1015 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1016 		kfree_rcu(key, rcu);
1017 	}
1018 }
1019 
1020 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1021 				 int optlen)
1022 {
1023 	struct tcp_md5sig cmd;
1024 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1025 
1026 	if (optlen < sizeof(cmd))
1027 		return -EINVAL;
1028 
1029 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1030 		return -EFAULT;
1031 
1032 	if (sin->sin_family != AF_INET)
1033 		return -EINVAL;
1034 
1035 	if (!cmd.tcpm_keylen)
1036 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1037 				      AF_INET);
1038 
1039 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1040 		return -EINVAL;
1041 
1042 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1043 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1044 			      GFP_KERNEL);
1045 }
1046 
1047 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1048 					__be32 daddr, __be32 saddr, int nbytes)
1049 {
1050 	struct tcp4_pseudohdr *bp;
1051 	struct scatterlist sg;
1052 
1053 	bp = &hp->md5_blk.ip4;
1054 
1055 	/*
1056 	 * 1. the TCP pseudo-header (in the order: source IP address,
1057 	 * destination IP address, zero-padded protocol number, and
1058 	 * segment length)
1059 	 */
1060 	bp->saddr = saddr;
1061 	bp->daddr = daddr;
1062 	bp->pad = 0;
1063 	bp->protocol = IPPROTO_TCP;
1064 	bp->len = cpu_to_be16(nbytes);
1065 
1066 	sg_init_one(&sg, bp, sizeof(*bp));
1067 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1068 }
1069 
1070 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1071 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1072 {
1073 	struct tcp_md5sig_pool *hp;
1074 	struct hash_desc *desc;
1075 
1076 	hp = tcp_get_md5sig_pool();
1077 	if (!hp)
1078 		goto clear_hash_noput;
1079 	desc = &hp->md5_desc;
1080 
1081 	if (crypto_hash_init(desc))
1082 		goto clear_hash;
1083 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1084 		goto clear_hash;
1085 	if (tcp_md5_hash_header(hp, th))
1086 		goto clear_hash;
1087 	if (tcp_md5_hash_key(hp, key))
1088 		goto clear_hash;
1089 	if (crypto_hash_final(desc, md5_hash))
1090 		goto clear_hash;
1091 
1092 	tcp_put_md5sig_pool();
1093 	return 0;
1094 
1095 clear_hash:
1096 	tcp_put_md5sig_pool();
1097 clear_hash_noput:
1098 	memset(md5_hash, 0, 16);
1099 	return 1;
1100 }
1101 
1102 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1103 			const struct sock *sk, const struct request_sock *req,
1104 			const struct sk_buff *skb)
1105 {
1106 	struct tcp_md5sig_pool *hp;
1107 	struct hash_desc *desc;
1108 	const struct tcphdr *th = tcp_hdr(skb);
1109 	__be32 saddr, daddr;
1110 
1111 	if (sk) {
1112 		saddr = inet_sk(sk)->inet_saddr;
1113 		daddr = inet_sk(sk)->inet_daddr;
1114 	} else if (req) {
1115 		saddr = inet_rsk(req)->ir_loc_addr;
1116 		daddr = inet_rsk(req)->ir_rmt_addr;
1117 	} else {
1118 		const struct iphdr *iph = ip_hdr(skb);
1119 		saddr = iph->saddr;
1120 		daddr = iph->daddr;
1121 	}
1122 
1123 	hp = tcp_get_md5sig_pool();
1124 	if (!hp)
1125 		goto clear_hash_noput;
1126 	desc = &hp->md5_desc;
1127 
1128 	if (crypto_hash_init(desc))
1129 		goto clear_hash;
1130 
1131 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1132 		goto clear_hash;
1133 	if (tcp_md5_hash_header(hp, th))
1134 		goto clear_hash;
1135 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1136 		goto clear_hash;
1137 	if (tcp_md5_hash_key(hp, key))
1138 		goto clear_hash;
1139 	if (crypto_hash_final(desc, md5_hash))
1140 		goto clear_hash;
1141 
1142 	tcp_put_md5sig_pool();
1143 	return 0;
1144 
1145 clear_hash:
1146 	tcp_put_md5sig_pool();
1147 clear_hash_noput:
1148 	memset(md5_hash, 0, 16);
1149 	return 1;
1150 }
1151 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1152 
1153 static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
1154 				      const struct sk_buff *skb)
1155 {
1156 	/*
1157 	 * This gets called for each TCP segment that arrives
1158 	 * so we want to be efficient.
1159 	 * We have 3 drop cases:
1160 	 * o No MD5 hash and one expected.
1161 	 * o MD5 hash and we're not expecting one.
1162 	 * o MD5 hash and its wrong.
1163 	 */
1164 	const __u8 *hash_location = NULL;
1165 	struct tcp_md5sig_key *hash_expected;
1166 	const struct iphdr *iph = ip_hdr(skb);
1167 	const struct tcphdr *th = tcp_hdr(skb);
1168 	int genhash;
1169 	unsigned char newhash[16];
1170 
1171 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1172 					  AF_INET);
1173 	hash_location = tcp_parse_md5sig_option(th);
1174 
1175 	/* We've parsed the options - do we have a hash? */
1176 	if (!hash_expected && !hash_location)
1177 		return false;
1178 
1179 	if (hash_expected && !hash_location) {
1180 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1181 		return true;
1182 	}
1183 
1184 	if (!hash_expected && hash_location) {
1185 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1186 		return true;
1187 	}
1188 
1189 	/* Okay, so this is hash_expected and hash_location -
1190 	 * so we need to calculate the checksum.
1191 	 */
1192 	genhash = tcp_v4_md5_hash_skb(newhash,
1193 				      hash_expected,
1194 				      NULL, NULL, skb);
1195 
1196 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1197 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1198 				     &iph->saddr, ntohs(th->source),
1199 				     &iph->daddr, ntohs(th->dest),
1200 				     genhash ? " tcp_v4_calc_md5_hash failed"
1201 				     : "");
1202 		return true;
1203 	}
1204 	return false;
1205 }
1206 
/*
 * Run the MD5 inbound check inside an RCU read-side critical section.
 * Returns true when the segment must be dropped.
 */
static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	bool drop;

	rcu_read_lock();
	drop = __tcp_v4_inbound_md5_hash(sk, skb);
	rcu_read_unlock();

	return drop;
}
1217 
1218 #endif
1219 
1220 static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
1221 			    struct sk_buff *skb)
1222 {
1223 	struct inet_request_sock *ireq = inet_rsk(req);
1224 
1225 	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
1226 	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
1227 	ireq->no_srccheck = inet_sk(sk)->transparent;
1228 	ireq->opt = tcp_v4_save_options(skb);
1229 }
1230 
1231 static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
1232 					  const struct request_sock *req,
1233 					  bool *strict)
1234 {
1235 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1236 
1237 	if (strict) {
1238 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1239 			*strict = true;
1240 		else
1241 			*strict = false;
1242 	}
1243 
1244 	return dst;
1245 }
1246 
1247 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1248 	.family		=	PF_INET,
1249 	.obj_size	=	sizeof(struct tcp_request_sock),
1250 	.rtx_syn_ack	=	tcp_rtx_synack,
1251 	.send_ack	=	tcp_v4_reqsk_send_ack,
1252 	.destructor	=	tcp_v4_reqsk_destructor,
1253 	.send_reset	=	tcp_v4_send_reset,
1254 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1255 };
1256 
1257 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1258 	.mss_clamp	=	TCP_MSS_DEFAULT,
1259 #ifdef CONFIG_TCP_MD5SIG
1260 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1261 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1262 #endif
1263 	.init_req	=	tcp_v4_init_req,
1264 #ifdef CONFIG_SYN_COOKIES
1265 	.cookie_init_seq =	cookie_v4_init_sequence,
1266 #endif
1267 	.route_req	=	tcp_v4_route_req,
1268 	.init_seq	=	tcp_v4_init_sequence,
1269 	.send_synack	=	tcp_v4_send_synack,
1270 	.queue_hash_add =	inet_csk_reqsk_queue_hash_add,
1271 };
1272 
1273 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1274 {
1275 	/* Never answer to SYNs send to broadcast or multicast */
1276 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1277 		goto drop;
1278 
1279 	return tcp_conn_request(&tcp_request_sock_ops,
1280 				&tcp_request_sock_ipv4_ops, sk, skb);
1281 
1282 drop:
1283 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1284 	return 0;
1285 }
1286 EXPORT_SYMBOL(tcp_v4_conn_request);
1287 
1288 
1289 /*
1290  * The three way handshake has completed - we got a valid synack -
1291  * now create the new socket.
 *
 * @sk:  the listening socket
 * @skb: the segment that completed the handshake
 * @req: the request sock describing the pending connection
 * @dst: route to use, or NULL to have one looked up here
 *
 * Returns the new child socket, or NULL on failure.  Every failure path
 * bumps LINUX_MIB_LISTENDROPS; a full accept queue additionally bumps
 * LINUX_MIB_LISTENOVERFLOWS.
1292  */
1293 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1294 				  struct request_sock *req,
1295 				  struct dst_entry *dst)
1296 {
1297 	struct inet_request_sock *ireq;
1298 	struct inet_sock *newinet;
1299 	struct tcp_sock *newtp;
1300 	struct sock *newsk;
1301 #ifdef CONFIG_TCP_MD5SIG
1302 	struct tcp_md5sig_key *key;
1303 #endif
1304 	struct ip_options_rcu *inet_opt;
1305 
1306 	if (sk_acceptq_is_full(sk))
1307 		goto exit_overflow;
1308 
1309 	newsk = tcp_create_openreq_child(sk, req, skb);
1310 	if (!newsk)
1311 		goto exit_nonewsk;
1312 
1313 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1314 	inet_sk_rx_dst_set(newsk, skb);
1315 
	/* Copy the addressing recorded in the request sock into the child. */
1316 	newtp		      = tcp_sk(newsk);
1317 	newinet		      = inet_sk(newsk);
1318 	ireq		      = inet_rsk(req);
1319 	newinet->inet_daddr   = ireq->ir_rmt_addr;
1320 	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
1321 	newinet->inet_saddr	      = ireq->ir_loc_addr;
	/* The saved IP options move from the request to the child. */
1322 	inet_opt	      = ireq->opt;
1323 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1324 	ireq->opt	      = NULL;
1325 	newinet->mc_index     = inet_iif(skb);
1326 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1327 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1328 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1329 	inet_set_txhash(newsk);
1330 	if (inet_opt)
1331 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1332 	newinet->inet_id = newtp->write_seq ^ jiffies;
1333 
1334 	if (!dst) {
1335 		dst = inet_csk_route_child_sock(sk, newsk, req);
1336 		if (!dst)
1337 			goto put_and_exit;
1338 	} else {
1339 		/* syncookie case : see end of cookie_v4_check() */
1340 	}
1341 	sk_setup_caps(newsk, dst);
1342 
1343 	tcp_sync_mss(newsk, dst_mtu(dst));
1344 	newtp->advmss = dst_metric_advmss(dst);
	/* A user-set MSS on the listener can only lower the advertised MSS. */
1345 	if (tcp_sk(sk)->rx_opt.user_mss &&
1346 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1347 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1348 
1349 	tcp_initialize_rcv_mss(newsk);
1350 
1351 #ifdef CONFIG_TCP_MD5SIG
1352 	/* Copy over the MD5 key from the original socket */
1353 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1354 				AF_INET);
1355 	if (key != NULL) {
1356 		/*
1357 		 * We're using one, so create a matching key
1358 		 * on the newsk structure. If we fail to get
1359 		 * memory, then we end up not copying the key
1360 		 * across. Shucks.
1361 		 */
1362 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1363 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1364 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1365 	}
1366 #endif
1367 
1368 	if (__inet_inherit_port(sk, newsk) < 0)
1369 		goto put_and_exit;
1370 	__inet_hash_nolisten(newsk, NULL);
1371 
1372 	return newsk;
1373 
1374 exit_overflow:
1375 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1376 exit_nonewsk:
	/* No child was created on these two paths: only the route is released. */
1377 	dst_release(dst);
1378 exit:
1379 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1380 	return NULL;
1381 put_and_exit:
	/* A child exists but could not be completed: tear it down. */
1382 	inet_csk_prepare_forced_close(newsk);
1383 	tcp_done(newsk);
1384 	goto exit;
1385 }
1386 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1387 
1388 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1389 {
1390 	struct tcphdr *th = tcp_hdr(skb);
1391 	const struct iphdr *iph = ip_hdr(skb);
1392 	struct sock *nsk;
1393 	struct request_sock **prev;
1394 	/* Find possible connection requests. */
1395 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1396 						       iph->saddr, iph->daddr);
1397 	if (req)
1398 		return tcp_check_req(sk, skb, req, prev, false);
1399 
1400 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1401 			th->source, iph->daddr, th->dest, inet_iif(skb));
1402 
1403 	if (nsk) {
1404 		if (nsk->sk_state != TCP_TIME_WAIT) {
1405 			bh_lock_sock(nsk);
1406 			return nsk;
1407 		}
1408 		inet_twsk_put(inet_twsk(nsk));
1409 		return NULL;
1410 	}
1411 
1412 #ifdef CONFIG_SYN_COOKIES
1413 	if (!th->syn)
1414 		sk = cookie_v4_check(sk, skb);
1415 #endif
1416 	return sk;
1417 }
1418 
1419 /* The socket must have it's spinlock held when we get
1420  * here.
1421  *
1422  * We have a potential double-lock case here, so even when
1423  * doing backlog processing we use the BH locking scheme.
1424  * This is because we cannot sleep with the original spinlock
1425  * held.
1426  */
1427 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1428 {
1429 	struct sock *rsk;
1430 
1431 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1432 		struct dst_entry *dst = sk->sk_rx_dst;
1433 
1434 		sock_rps_save_rxhash(sk, skb);
1435 		sk_mark_napi_id(sk, skb);
1436 		if (dst) {
1437 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1438 			    dst->ops->check(dst, 0) == NULL) {
1439 				dst_release(dst);
1440 				sk->sk_rx_dst = NULL;
1441 			}
1442 		}
1443 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1444 		return 0;
1445 	}
1446 
1447 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1448 		goto csum_err;
1449 
1450 	if (sk->sk_state == TCP_LISTEN) {
1451 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1452 		if (!nsk)
1453 			goto discard;
1454 
1455 		if (nsk != sk) {
1456 			sock_rps_save_rxhash(nsk, skb);
1457 			sk_mark_napi_id(sk, skb);
1458 			if (tcp_child_process(sk, nsk, skb)) {
1459 				rsk = nsk;
1460 				goto reset;
1461 			}
1462 			return 0;
1463 		}
1464 	} else
1465 		sock_rps_save_rxhash(sk, skb);
1466 
1467 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1468 		rsk = sk;
1469 		goto reset;
1470 	}
1471 	return 0;
1472 
1473 reset:
1474 	tcp_v4_send_reset(rsk, skb);
1475 discard:
1476 	kfree_skb(skb);
1477 	/* Be careful here. If this function gets more complicated and
1478 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1479 	 * might be destroyed here. This current version compiles correctly,
1480 	 * but you have been warned.
1481 	 */
1482 	return 0;
1483 
1484 csum_err:
1485 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1486 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1487 	goto discard;
1488 }
1489 EXPORT_SYMBOL(tcp_v4_do_rcv);
1490 
1491 void tcp_v4_early_demux(struct sk_buff *skb)
1492 {
1493 	const struct iphdr *iph;
1494 	const struct tcphdr *th;
1495 	struct sock *sk;
1496 
1497 	if (skb->pkt_type != PACKET_HOST)
1498 		return;
1499 
1500 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1501 		return;
1502 
1503 	iph = ip_hdr(skb);
1504 	th = tcp_hdr(skb);
1505 
1506 	if (th->doff < sizeof(struct tcphdr) / 4)
1507 		return;
1508 
1509 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1510 				       iph->saddr, th->source,
1511 				       iph->daddr, ntohs(th->dest),
1512 				       skb->skb_iif);
1513 	if (sk) {
1514 		skb->sk = sk;
1515 		skb->destructor = sock_edemux;
1516 		if (sk->sk_state != TCP_TIME_WAIT) {
1517 			struct dst_entry *dst = sk->sk_rx_dst;
1518 
1519 			if (dst)
1520 				dst = dst_check(dst, 0);
1521 			if (dst &&
1522 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1523 				skb_dst_set_noref(skb, dst);
1524 		}
1525 	}
1526 }
1527 
1528 /* Packet is added to VJ-style prequeue for processing in process
1529  * context, if a reader task is waiting. Apparently, this exciting
1530  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1531  * failed somewhere. Latency? Burstiness? Well, at least now we will
1532  * see, why it failed. 8)8)				  --ANK
1533  *
 * Returns true when the skb was taken by the prequeue (queued, or queued
 * and then flushed through sk_backlog_rcv()), false when the caller has
 * to process the skb itself.
1534  */
1535 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1536 {
1537 	struct tcp_sock *tp = tcp_sk(sk);
1538 
	/* Prequeue is off when low latency is requested or nobody is reading. */
1539 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1540 		return false;
1541 
	/* Do not start a prequeue with a segment that carries no payload. */
1542 	if (skb->len <= tcp_hdrlen(skb) &&
1543 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1544 		return false;
1545 
1546 	/* Before escaping RCU protected region, we need to take care of skb
1547 	 * dst. Prequeue is only enabled for established sockets.
1548 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1549 	 * Instead of doing full sk_rx_dst validity here, let's perform
1550 	 * an optimistic check.
1551 	 */
1552 	if (likely(sk->sk_rx_dst))
1553 		skb_dst_drop(skb);
1554 	else
1555 		skb_dst_force(skb);
1556 
1557 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1558 	tp->ucopy.memory += skb->truesize;
1559 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1560 		struct sk_buff *skb1;
1561 
1562 		BUG_ON(sock_owned_by_user(sk));
1563 
		/* Over the receive buffer budget: push everything queued so
		 * far (including this skb) through sk_backlog_rcv() now.
		 */
1564 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1565 			sk_backlog_rcv(sk, skb1);
1566 			NET_INC_STATS_BH(sock_net(sk),
1567 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1568 		}
1569 
1570 		tp->ucopy.memory = 0;
1571 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		/* First queued segment: wake the reader and, unless an ACK is
		 * already scheduled, arm the delayed-ACK timer.
		 */
1572 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1573 					   POLLIN | POLLRDNORM | POLLRDBAND);
1574 		if (!inet_csk_ack_scheduled(sk))
1575 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1576 						  (3 * tcp_rto_min(sk)) / 4,
1577 						  TCP_RTO_MAX);
1578 	}
1579 	return true;
1580 }
1581 EXPORT_SYMBOL(tcp_prequeue);
1582 
1583 /*
1584  *	From tcp_input.c
1585  */
1586 
/*
 * Receive entry point for IPv4 TCP segments.
 *
 * Validates the TCP header length and checksum, fills TCP_SKB_CB(), looks
 * up the owning socket and then either processes the segment directly
 * (tcp_prequeue() / tcp_v4_do_rcv()) or, when the socket is owned by user
 * context, appends it to the socket backlog.  Segments without a socket
 * are answered with a reset unless they are malformed; TIME_WAIT sockets
 * are handled under do_time_wait.
 *
 * Returns the value of tcp_v4_do_rcv() when that was called, 0 otherwise.
 */
1587 int tcp_v4_rcv(struct sk_buff *skb)
1588 {
1589 	const struct iphdr *iph;
1590 	const struct tcphdr *th;
1591 	struct sock *sk;
1592 	int ret;
1593 	struct net *net = dev_net(skb->dev);
1594 
1595 	if (skb->pkt_type != PACKET_HOST)
1596 		goto discard_it;
1597 
1598 	/* Count it even if it's bad */
1599 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1600 
1601 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1602 		goto discard_it;
1603 
1604 	th = tcp_hdr(skb);
1605 
1606 	if (th->doff < sizeof(struct tcphdr) / 4)
1607 		goto bad_packet;
1608 	if (!pskb_may_pull(skb, th->doff * 4))
1609 		goto discard_it;
1610 
1611 	/* An explanation is required here, I think.
1612 	 * Packet length and doff are validated by header prediction,
1613 	 * provided case of th->doff==0 is eliminated.
1614 	 * So, we defer the checks. */
1615 
1616 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1617 		goto csum_error;
1618 
	/* NOTE(review): th is fetched again, presumably because the second
	 * pskb_may_pull() above may have moved the header - confirm.
	 */
1619 	th = tcp_hdr(skb);
1620 	iph = ip_hdr(skb);
1621 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1622 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1623 	 */
1624 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1625 		sizeof(struct inet_skb_parm));
1626 	barrier();
1627 
1628 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each occupy one unit of sequence space next to the payload. */
1629 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1630 				    skb->len - th->doff * 4);
1631 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1632 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1633 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1634 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1635 	TCP_SKB_CB(skb)->sacked	 = 0;
1636 
1637 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1638 	if (!sk)
1639 		goto no_tcp_socket;
1640 
	/* Also re-entered from do_time_wait with sk switched to a listener. */
1641 process:
1642 	if (sk->sk_state == TCP_TIME_WAIT)
1643 		goto do_time_wait;
1644 
1645 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1646 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1647 		goto discard_and_relse;
1648 	}
1649 
1650 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1651 		goto discard_and_relse;
1652 
1653 #ifdef CONFIG_TCP_MD5SIG
1654 	/*
1655 	 * We really want to reject the packet as early as possible
1656 	 * if:
1657 	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1658 	 *  o There is an MD5 option and we're not expecting one
1659 	 */
1660 	if (tcp_v4_inbound_md5_hash(sk, skb))
1661 		goto discard_and_relse;
1662 #endif
1663 
1664 	nf_reset(skb);
1665 
1666 	if (sk_filter(sk, skb))
1667 		goto discard_and_relse;
1668 
1669 	sk_incoming_cpu_update(sk);
1670 	skb->dev = NULL;
1671 
1672 	bh_lock_sock_nested(sk);
1673 	ret = 0;
	/* Process now when user context does not own the socket; otherwise
	 * queue on the backlog, bounded by rcvbuf + sndbuf.
	 */
1674 	if (!sock_owned_by_user(sk)) {
1675 		if (!tcp_prequeue(sk, skb))
1676 			ret = tcp_v4_do_rcv(sk, skb);
1677 	} else if (unlikely(sk_add_backlog(sk, skb,
1678 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1679 		bh_unlock_sock(sk);
1680 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1681 		goto discard_and_relse;
1682 	}
1683 	bh_unlock_sock(sk);
1684 
1685 	sock_put(sk);
1686 
1687 	return ret;
1688 
1689 no_tcp_socket:
1690 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1691 		goto discard_it;
1692 
	/* csum_error and bad_packet are entered from above by goto; they sit
	 * inside this branch so that they share the error counters.
	 */
1693 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1694 csum_error:
1695 		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1696 bad_packet:
1697 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1698 	} else {
1699 		tcp_v4_send_reset(NULL, skb);
1700 	}
1701 
1702 discard_it:
1703 	/* Discard frame. */
1704 	kfree_skb(skb);
1705 	return 0;
1706 
1707 discard_and_relse:
1708 	sock_put(sk);
1709 	goto discard_it;
1710 
1711 do_time_wait:
	/* Every exit from here on drops the timewait reference with
	 * inet_twsk_put(), except where the callee is handed the socket.
	 */
1712 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1713 		inet_twsk_put(inet_twsk(sk));
1714 		goto discard_it;
1715 	}
1716 
1717 	if (skb->len < (th->doff << 2)) {
1718 		inet_twsk_put(inet_twsk(sk));
1719 		goto bad_packet;
1720 	}
1721 	if (tcp_checksum_complete(skb)) {
1722 		inet_twsk_put(inet_twsk(sk));
1723 		goto csum_error;
1724 	}
1725 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1726 	case TCP_TW_SYN: {
		/* A new SYN hit the TIME_WAIT socket: hand it to a matching
		 * listener if there is one, retiring the timewait socket.
		 */
1727 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1728 							&tcp_hashinfo,
1729 							iph->saddr, th->source,
1730 							iph->daddr, th->dest,
1731 							inet_iif(skb));
1732 		if (sk2) {
1733 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1734 			inet_twsk_put(inet_twsk(sk));
1735 			sk = sk2;
1736 			goto process;
1737 		}
1738 		/* Fall through to ACK */
1739 	}
1740 	case TCP_TW_ACK:
1741 		tcp_v4_timewait_ack(sk, skb);
1742 		break;
1743 	case TCP_TW_RST:
1744 		goto no_tcp_socket;
1745 	case TCP_TW_SUCCESS:;
1746 	}
1747 	goto discard_it;
1748 }
1749 
1750 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1751 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1752 	.twsk_unique	= tcp_twsk_unique,
1753 	.twsk_destructor= tcp_twsk_destructor,
1754 };
1755 
1756 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1757 {
1758 	struct dst_entry *dst = skb_dst(skb);
1759 
1760 	if (dst) {
1761 		dst_hold(dst);
1762 		sk->sk_rx_dst = dst;
1763 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1764 	}
1765 }
1766 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1767 
1768 const struct inet_connection_sock_af_ops ipv4_specific = {
1769 	.queue_xmit	   = ip_queue_xmit,
1770 	.send_check	   = tcp_v4_send_check,
1771 	.rebuild_header	   = inet_sk_rebuild_header,
1772 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1773 	.conn_request	   = tcp_v4_conn_request,
1774 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1775 	.net_header_len	   = sizeof(struct iphdr),
1776 	.setsockopt	   = ip_setsockopt,
1777 	.getsockopt	   = ip_getsockopt,
1778 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1779 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1780 	.bind_conflict	   = inet_csk_bind_conflict,
1781 #ifdef CONFIG_COMPAT
1782 	.compat_setsockopt = compat_ip_setsockopt,
1783 	.compat_getsockopt = compat_ip_getsockopt,
1784 #endif
1785 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1786 };
1787 EXPORT_SYMBOL(ipv4_specific);
1788 
#ifdef CONFIG_TCP_MD5SIG
/* MD5 signature hooks for IPv4 TCP sockets; installed by tcp_v4_init_sock(). */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup = tcp_v4_md5_lookup,
	.calc_md5_hash = tcp_v4_md5_hash_skb,
	.md5_parse = tcp_v4_parse_md5_keys,
};
#endif
1796 
1797 /* NOTE: A lot of things set to zero explicitly by call to
1798  *       sk_alloc() so need not be done here.
1799  */
1800 static int tcp_v4_init_sock(struct sock *sk)
1801 {
1802 	struct inet_connection_sock *icsk = inet_csk(sk);
1803 
1804 	tcp_init_sock(sk);
1805 
1806 	icsk->icsk_af_ops = &ipv4_specific;
1807 
1808 #ifdef CONFIG_TCP_MD5SIG
1809 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1810 #endif
1811 
1812 	return 0;
1813 }
1814 
/*
 * Release the TCP-level state attached to @sk: timers, congestion control
 * state, write queue, out-of-order queue, MD5 keys (when configured),
 * prequeue, bound port and any pending fastopen request.  Finishes by
 * dropping the allocated-sockets count and the memcg association.
 */
1815 void tcp_v4_destroy_sock(struct sock *sk)
1816 {
1817 	struct tcp_sock *tp = tcp_sk(sk);
1818 
1819 	tcp_clear_xmit_timers(sk);
1820 
1821 	tcp_cleanup_congestion_control(sk);
1822 
1823 	/* Cleanup up the write buffer. */
1824 	tcp_write_queue_purge(sk);
1825 
1826 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1827 	__skb_queue_purge(&tp->out_of_order_queue);
1828 
1829 #ifdef CONFIG_TCP_MD5SIG
1830 	/* Clean up the MD5 key list, if any */
1831 	if (tp->md5sig_info) {
		/* Keys first, then the list head itself after a grace period. */
1832 		tcp_clear_md5_list(sk);
1833 		kfree_rcu(tp->md5sig_info, rcu);
1834 		tp->md5sig_info = NULL;
1835 	}
1836 #endif
1837 
1838 	/* Clean prequeue, it must be empty really */
1839 	__skb_queue_purge(&tp->ucopy.prequeue);
1840 
1841 	/* Clean up a referenced TCP bind bucket. */
1842 	if (inet_csk(sk)->icsk_bind_hash)
1843 		inet_put_port(sk);
1844 
	/* A fastopen request sock must already be gone at this point. */
1845 	BUG_ON(tp->fastopen_rsk != NULL);
1846 
1847 	/* If socket is aborted during connect operation */
1848 	tcp_free_fastopen_req(tp);
1849 
1850 	sk_sockets_allocated_dec(sk);
1851 	sock_release_memcg(sk);
1852 }
1853 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1854 
1855 #ifdef CONFIG_PROC_FS
1856 /* Proc filesystem TCP sock list dumping. */
1857 
1858 /*
1859  * Get next listener socket follow cur.  If cur is NULL, get first socket
1860  * starting from bucket given in st->bucket; when st->bucket is zero the
1861  * very first socket in the hash table is returned.
 *
 * The walk covers listening sockets and, for a listener with a non-empty
 * accept queue, the request socks in its syn_table.  What @cur and the
 * return value point to depends on st->state: a request_sock in
 * TCP_SEQ_STATE_OPENREQ, a sock otherwise.
 *
 * Locking on return:
 *  - a listening socket is returned with the lock of bucket st->bucket held;
 *  - a request sock is returned with that bucket lock held and, in
 *    addition, the syn_wait_lock of st->syn_wait_sk held for reading;
 *  - NULL is returned with no lock held, once all buckets are exhausted.
1862  */
1863 static void *listening_get_next(struct seq_file *seq, void *cur)
1864 {
1865 	struct inet_connection_sock *icsk;
1866 	struct hlist_nulls_node *node;
1867 	struct sock *sk = cur;
1868 	struct inet_listen_hashbucket *ilb;
1869 	struct tcp_iter_state *st = seq->private;
1870 	struct net *net = seq_file_net(seq);
1871 
1872 	if (!sk) {
		/* Fresh start: lock the current bucket and scan from its head. */
1873 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1874 		spin_lock_bh(&ilb->lock);
1875 		sk = sk_nulls_head(&ilb->head);
1876 		st->offset = 0;
1877 		goto get_sk;
1878 	}
1879 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1880 	++st->num;
1881 	++st->offset;
1882 
1883 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		/* Continue inside the syn_table of st->syn_wait_sk. */
1884 		struct request_sock *req = cur;
1885 
1886 		icsk = inet_csk(st->syn_wait_sk);
1887 		req = req->dl_next;
1888 		while (1) {
1889 			while (req) {
1890 				if (req->rsk_ops->family == st->family) {
1891 					cur = req;
1892 					goto out;
1893 				}
1894 				req = req->dl_next;
1895 			}
1896 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1897 				break;
			/* Also entered from start_req below with sbucket == 0. */
1898 get_req:
1899 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1900 		}
		/* Request table exhausted: resume with the next listener. */
1901 		sk	  = sk_nulls_next(st->syn_wait_sk);
1902 		st->state = TCP_SEQ_STATE_LISTENING;
1903 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1904 	} else {
		/* cur was a listener: walk its pending requests first, if any. */
1905 		icsk = inet_csk(sk);
1906 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1907 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1908 			goto start_req;
1909 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1910 		sk = sk_nulls_next(sk);
1911 	}
1912 get_sk:
1913 	sk_nulls_for_each_from(sk, node) {
1914 		if (!net_eq(sock_net(sk), net))
1915 			continue;
1916 		if (sk->sk_family == st->family) {
1917 			cur = sk;
1918 			goto out;
1919 		}
		/* Listener of another family: its request socks are still scanned. */
1920 		icsk = inet_csk(sk);
1921 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1922 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1923 start_req:
1924 			st->uid		= sock_i_uid(sk);
1925 			st->syn_wait_sk = sk;
1926 			st->state	= TCP_SEQ_STATE_OPENREQ;
1927 			st->sbucket	= 0;
1928 			goto get_req;
1929 		}
1930 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1931 	}
	/* End of this bucket: release it and move on to the next one. */
1932 	spin_unlock_bh(&ilb->lock);
1933 	st->offset = 0;
1934 	if (++st->bucket < INET_LHTABLE_SIZE) {
1935 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1936 		spin_lock_bh(&ilb->lock);
1937 		sk = sk_nulls_head(&ilb->head);
1938 		goto get_sk;
1939 	}
1940 	cur = NULL;
1941 out:
1942 	return cur;
1943 }
1944 
1945 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1946 {
1947 	struct tcp_iter_state *st = seq->private;
1948 	void *rc;
1949 
1950 	st->bucket = 0;
1951 	st->offset = 0;
1952 	rc = listening_get_next(seq, NULL);
1953 
1954 	while (rc && *pos) {
1955 		rc = listening_get_next(seq, rc);
1956 		--*pos;
1957 	}
1958 	return rc;
1959 }
1960 
1961 static inline bool empty_bucket(const struct tcp_iter_state *st)
1962 {
1963 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1964 }
1965 
1966 /*
1967  * Get first established socket starting from bucket given in st->bucket.
1968  * If st->bucket is zero, the very first socket in the hash is returned.
1969  */
1970 static void *established_get_first(struct seq_file *seq)
1971 {
1972 	struct tcp_iter_state *st = seq->private;
1973 	struct net *net = seq_file_net(seq);
1974 	void *rc = NULL;
1975 
1976 	st->offset = 0;
1977 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1978 		struct sock *sk;
1979 		struct hlist_nulls_node *node;
1980 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1981 
1982 		/* Lockless fast path for the common case of empty buckets */
1983 		if (empty_bucket(st))
1984 			continue;
1985 
1986 		spin_lock_bh(lock);
1987 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1988 			if (sk->sk_family != st->family ||
1989 			    !net_eq(sock_net(sk), net)) {
1990 				continue;
1991 			}
1992 			rc = sk;
1993 			goto out;
1994 		}
1995 		spin_unlock_bh(lock);
1996 	}
1997 out:
1998 	return rc;
1999 }
2000 
2001 static void *established_get_next(struct seq_file *seq, void *cur)
2002 {
2003 	struct sock *sk = cur;
2004 	struct hlist_nulls_node *node;
2005 	struct tcp_iter_state *st = seq->private;
2006 	struct net *net = seq_file_net(seq);
2007 
2008 	++st->num;
2009 	++st->offset;
2010 
2011 	sk = sk_nulls_next(sk);
2012 
2013 	sk_nulls_for_each_from(sk, node) {
2014 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2015 			return sk;
2016 	}
2017 
2018 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2019 	++st->bucket;
2020 	return established_get_first(seq);
2021 }
2022 
2023 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2024 {
2025 	struct tcp_iter_state *st = seq->private;
2026 	void *rc;
2027 
2028 	st->bucket = 0;
2029 	rc = established_get_first(seq);
2030 
2031 	while (rc && pos) {
2032 		rc = established_get_next(seq, rc);
2033 		--pos;
2034 	}
2035 	return rc;
2036 }
2037 
2038 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2039 {
2040 	void *rc;
2041 	struct tcp_iter_state *st = seq->private;
2042 
2043 	st->state = TCP_SEQ_STATE_LISTENING;
2044 	rc	  = listening_get_idx(seq, &pos);
2045 
2046 	if (!rc) {
2047 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2048 		rc	  = established_get_idx(seq, pos);
2049 	}
2050 
2051 	return rc;
2052 }
2053 
/*
 * Try to resume the walk where the previous read stopped, using the
 * bucket (st->bucket) and in-bucket offset (st->offset) saved in the
 * iterator state instead of rescanning from the start.
 *
 * Returns the entry found, or NULL when the saved position no longer
 * yields one.  st->num is restored to its value on entry because the
 * get_next helpers used for skipping increment it.
 */
2054 static void *tcp_seek_last_pos(struct seq_file *seq)
2055 {
2056 	struct tcp_iter_state *st = seq->private;
2057 	int offset = st->offset;
2058 	int orig_num = st->num;
2059 	void *rc = NULL;
2060 
2061 	switch (st->state) {
2062 	case TCP_SEQ_STATE_OPENREQ:
2063 	case TCP_SEQ_STATE_LISTENING:
2064 		if (st->bucket >= INET_LHTABLE_SIZE)
2065 			break;
		/* Restart at the head of the saved bucket, then skip forward. */
2066 		st->state = TCP_SEQ_STATE_LISTENING;
2067 		rc = listening_get_next(seq, NULL);
2068 		while (offset-- && rc)
2069 			rc = listening_get_next(seq, rc);
2070 		if (rc)
2071 			break;
		/* Listening table exhausted: carry on in the established table,
		 * with whatever is left in offset.
		 */
2072 		st->bucket = 0;
2073 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2074 		/* Fallthrough */
2075 	case TCP_SEQ_STATE_ESTABLISHED:
2076 		if (st->bucket > tcp_hashinfo.ehash_mask)
2077 			break;
2078 		rc = established_get_first(seq);
2079 		while (offset-- && rc)
2080 			rc = established_get_next(seq, rc);
2081 	}
2082 
2083 	st->num = orig_num;
2084 
2085 	return rc;
2086 }
2087 
2088 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2089 {
2090 	struct tcp_iter_state *st = seq->private;
2091 	void *rc;
2092 
2093 	if (*pos && *pos == st->last_pos) {
2094 		rc = tcp_seek_last_pos(seq);
2095 		if (rc)
2096 			goto out;
2097 	}
2098 
2099 	st->state = TCP_SEQ_STATE_LISTENING;
2100 	st->num = 0;
2101 	st->bucket = 0;
2102 	st->offset = 0;
2103 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2104 
2105 out:
2106 	st->last_pos = *pos;
2107 	return rc;
2108 }
2109 
2110 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2111 {
2112 	struct tcp_iter_state *st = seq->private;
2113 	void *rc = NULL;
2114 
2115 	if (v == SEQ_START_TOKEN) {
2116 		rc = tcp_get_idx(seq, 0);
2117 		goto out;
2118 	}
2119 
2120 	switch (st->state) {
2121 	case TCP_SEQ_STATE_OPENREQ:
2122 	case TCP_SEQ_STATE_LISTENING:
2123 		rc = listening_get_next(seq, v);
2124 		if (!rc) {
2125 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2126 			st->bucket = 0;
2127 			st->offset = 0;
2128 			rc	  = established_get_first(seq);
2129 		}
2130 		break;
2131 	case TCP_SEQ_STATE_ESTABLISHED:
2132 		rc = established_get_next(seq, v);
2133 		break;
2134 	}
2135 out:
2136 	++*pos;
2137 	st->last_pos = *pos;
2138 	return rc;
2139 }
2140 
2141 static void tcp_seq_stop(struct seq_file *seq, void *v)
2142 {
2143 	struct tcp_iter_state *st = seq->private;
2144 
2145 	switch (st->state) {
2146 	case TCP_SEQ_STATE_OPENREQ:
2147 		if (v) {
2148 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2149 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2150 		}
2151 	case TCP_SEQ_STATE_LISTENING:
2152 		if (v != SEQ_START_TOKEN)
2153 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2154 		break;
2155 	case TCP_SEQ_STATE_ESTABLISHED:
2156 		if (v)
2157 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2158 		break;
2159 	}
2160 }
2161 
2162 int tcp_seq_open(struct inode *inode, struct file *file)
2163 {
2164 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2165 	struct tcp_iter_state *s;
2166 	int err;
2167 
2168 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2169 			  sizeof(struct tcp_iter_state));
2170 	if (err < 0)
2171 		return err;
2172 
2173 	s = ((struct seq_file *)file->private_data)->private;
2174 	s->family		= afinfo->family;
2175 	s->last_pos		= 0;
2176 	return 0;
2177 }
2178 EXPORT_SYMBOL(tcp_seq_open);
2179 
2180 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2181 {
2182 	int rc = 0;
2183 	struct proc_dir_entry *p;
2184 
2185 	afinfo->seq_ops.start		= tcp_seq_start;
2186 	afinfo->seq_ops.next		= tcp_seq_next;
2187 	afinfo->seq_ops.stop		= tcp_seq_stop;
2188 
2189 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2190 			     afinfo->seq_fops, afinfo);
2191 	if (!p)
2192 		rc = -ENOMEM;
2193 	return rc;
2194 }
2195 EXPORT_SYMBOL(tcp_proc_register);
2196 
2197 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2198 {
2199 	remove_proc_entry(afinfo->name, net->proc_net);
2200 }
2201 EXPORT_SYMBOL(tcp_proc_unregister);
2202 
2203 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2204 			 struct seq_file *f, int i, kuid_t uid)
2205 {
2206 	const struct inet_request_sock *ireq = inet_rsk(req);
2207 	long delta = req->expires - jiffies;
2208 
2209 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2210 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2211 		i,
2212 		ireq->ir_loc_addr,
2213 		ntohs(inet_sk(sk)->inet_sport),
2214 		ireq->ir_rmt_addr,
2215 		ntohs(ireq->ir_rmt_port),
2216 		TCP_SYN_RECV,
2217 		0, 0, /* could print option size, but that is af dependent. */
2218 		1,    /* timers active (only the expire timer) */
2219 		jiffies_delta_to_clock_t(delta),
2220 		req->num_timeout,
2221 		from_kuid_munged(seq_user_ns(f), uid),
2222 		0,  /* non standard timer */
2223 		0, /* open_requests have no inode */
2224 		atomic_read(&sk->sk_refcnt),
2225 		req);
2226 }
2227 
2228 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2229 {
2230 	int timer_active;
2231 	unsigned long timer_expires;
2232 	const struct tcp_sock *tp = tcp_sk(sk);
2233 	const struct inet_connection_sock *icsk = inet_csk(sk);
2234 	const struct inet_sock *inet = inet_sk(sk);
2235 	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2236 	__be32 dest = inet->inet_daddr;
2237 	__be32 src = inet->inet_rcv_saddr;
2238 	__u16 destp = ntohs(inet->inet_dport);
2239 	__u16 srcp = ntohs(inet->inet_sport);
2240 	int rx_queue;
2241 
2242 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2243 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2244 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2245 		timer_active	= 1;
2246 		timer_expires	= icsk->icsk_timeout;
2247 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2248 		timer_active	= 4;
2249 		timer_expires	= icsk->icsk_timeout;
2250 	} else if (timer_pending(&sk->sk_timer)) {
2251 		timer_active	= 2;
2252 		timer_expires	= sk->sk_timer.expires;
2253 	} else {
2254 		timer_active	= 0;
2255 		timer_expires = jiffies;
2256 	}
2257 
2258 	if (sk->sk_state == TCP_LISTEN)
2259 		rx_queue = sk->sk_ack_backlog;
2260 	else
2261 		/*
2262 		 * because we dont lock socket, we might find a transient negative value
2263 		 */
2264 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2265 
2266 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2267 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2268 		i, src, srcp, dest, destp, sk->sk_state,
2269 		tp->write_seq - tp->snd_una,
2270 		rx_queue,
2271 		timer_active,
2272 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2273 		icsk->icsk_retransmits,
2274 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2275 		icsk->icsk_probes_out,
2276 		sock_i_ino(sk),
2277 		atomic_read(&sk->sk_refcnt), sk,
2278 		jiffies_to_clock_t(icsk->icsk_rto),
2279 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2280 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2281 		tp->snd_cwnd,
2282 		sk->sk_state == TCP_LISTEN ?
2283 		    (fastopenq ? fastopenq->max_qlen : 0) :
2284 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2285 }
2286 
2287 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2288 			       struct seq_file *f, int i)
2289 {
2290 	__be32 dest, src;
2291 	__u16 destp, srcp;
2292 	s32 delta = tw->tw_ttd - inet_tw_time_stamp();
2293 
2294 	dest  = tw->tw_daddr;
2295 	src   = tw->tw_rcv_saddr;
2296 	destp = ntohs(tw->tw_dport);
2297 	srcp  = ntohs(tw->tw_sport);
2298 
2299 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2300 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2301 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2302 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2303 		atomic_read(&tw->tw_refcnt), tw);
2304 }
2305 
2306 #define TMPSZ 150
2307 
2308 static int tcp4_seq_show(struct seq_file *seq, void *v)
2309 {
2310 	struct tcp_iter_state *st;
2311 	struct sock *sk = v;
2312 
2313 	seq_setwidth(seq, TMPSZ - 1);
2314 	if (v == SEQ_START_TOKEN) {
2315 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2316 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2317 			   "inode");
2318 		goto out;
2319 	}
2320 	st = seq->private;
2321 
2322 	switch (st->state) {
2323 	case TCP_SEQ_STATE_LISTENING:
2324 	case TCP_SEQ_STATE_ESTABLISHED:
2325 		if (sk->sk_state == TCP_TIME_WAIT)
2326 			get_timewait4_sock(v, seq, st->num);
2327 		else
2328 			get_tcp4_sock(v, seq, st->num);
2329 		break;
2330 	case TCP_SEQ_STATE_OPENREQ:
2331 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
2332 		break;
2333 	}
2334 out:
2335 	seq_pad(seq, '\n');
2336 	return 0;
2337 }
2338 
2339 static const struct file_operations tcp_afinfo_seq_fops = {
2340 	.owner   = THIS_MODULE,
2341 	.open    = tcp_seq_open,
2342 	.read    = seq_read,
2343 	.llseek  = seq_lseek,
2344 	.release = seq_release_net
2345 };
2346 
2347 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2348 	.name		= "tcp",
2349 	.family		= AF_INET,
2350 	.seq_fops	= &tcp_afinfo_seq_fops,
2351 	.seq_ops	= {
2352 		.show		= tcp4_seq_show,
2353 	},
2354 };
2355 
2356 static int __net_init tcp4_proc_init_net(struct net *net)
2357 {
2358 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2359 }
2360 
2361 static void __net_exit tcp4_proc_exit_net(struct net *net)
2362 {
2363 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2364 }
2365 
2366 static struct pernet_operations tcp4_net_ops = {
2367 	.init = tcp4_proc_init_net,
2368 	.exit = tcp4_proc_exit_net,
2369 };
2370 
2371 int __init tcp4_proc_init(void)
2372 {
2373 	return register_pernet_subsys(&tcp4_net_ops);
2374 }
2375 
2376 void tcp4_proc_exit(void)
2377 {
2378 	unregister_pernet_subsys(&tcp4_net_ops);
2379 }
2380 #endif /* CONFIG_PROC_FS */
2381 
2382 struct proto tcp_prot = {
2383 	.name			= "TCP",
2384 	.owner			= THIS_MODULE,
2385 	.close			= tcp_close,
2386 	.connect		= tcp_v4_connect,
2387 	.disconnect		= tcp_disconnect,
2388 	.accept			= inet_csk_accept,
2389 	.ioctl			= tcp_ioctl,
2390 	.init			= tcp_v4_init_sock,
2391 	.destroy		= tcp_v4_destroy_sock,
2392 	.shutdown		= tcp_shutdown,
2393 	.setsockopt		= tcp_setsockopt,
2394 	.getsockopt		= tcp_getsockopt,
2395 	.recvmsg		= tcp_recvmsg,
2396 	.sendmsg		= tcp_sendmsg,
2397 	.sendpage		= tcp_sendpage,
2398 	.backlog_rcv		= tcp_v4_do_rcv,
2399 	.release_cb		= tcp_release_cb,
2400 	.hash			= inet_hash,
2401 	.unhash			= inet_unhash,
2402 	.get_port		= inet_csk_get_port,
2403 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2404 	.stream_memory_free	= tcp_stream_memory_free,
2405 	.sockets_allocated	= &tcp_sockets_allocated,
2406 	.orphan_count		= &tcp_orphan_count,
2407 	.memory_allocated	= &tcp_memory_allocated,
2408 	.memory_pressure	= &tcp_memory_pressure,
2409 	.sysctl_mem		= sysctl_tcp_mem,
2410 	.sysctl_wmem		= sysctl_tcp_wmem,
2411 	.sysctl_rmem		= sysctl_tcp_rmem,
2412 	.max_header		= MAX_TCP_HEADER,
2413 	.obj_size		= sizeof(struct tcp_sock),
2414 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2415 	.twsk_prot		= &tcp_timewait_sock_ops,
2416 	.rsk_prot		= &tcp_request_sock_ops,
2417 	.h.hashinfo		= &tcp_hashinfo,
2418 	.no_autobind		= true,
2419 #ifdef CONFIG_COMPAT
2420 	.compat_setsockopt	= compat_tcp_setsockopt,
2421 	.compat_getsockopt	= compat_tcp_getsockopt,
2422 #endif
2423 #ifdef CONFIG_MEMCG_KMEM
2424 	.init_cgroup		= tcp_init_cgroup,
2425 	.destroy_cgroup		= tcp_destroy_cgroup,
2426 	.proto_cgroup		= tcp_proto_cgroup,
2427 #endif
2428 };
2429 EXPORT_SYMBOL(tcp_prot);
2430 
2431 static int __net_init tcp_sk_init(struct net *net)
2432 {
2433 	net->ipv4.sysctl_tcp_ecn = 2;
2434 	return 0;
2435 }
2436 
2437 static void __net_exit tcp_sk_exit(struct net *net)
2438 {
2439 }
2440 
2441 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2442 {
2443 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2444 }
2445 
2446 static struct pernet_operations __net_initdata tcp_sk_ops = {
2447        .init	   = tcp_sk_init,
2448        .exit	   = tcp_sk_exit,
2449        .exit_batch = tcp_sk_exit_batch,
2450 };
2451 
2452 void __init tcp_v4_init(void)
2453 {
2454 	inet_hashinfo_init(&tcp_hashinfo);
2455 	if (register_pernet_subsys(&tcp_sk_ops))
2456 		panic("Failed to create the TCP control socket.\n");
2457 }
2458