xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision 2d33394e23d63b750dcba40e5feaeba425427b52)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
78 
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84 
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87 
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91 
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96 
97 struct inet_hashinfo tcp_hashinfo;
98 EXPORT_SYMBOL(tcp_hashinfo);
99 
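/* Derive the initial sequence number for a connection from the addresses
 * and ports carried in the triggering segment.
 */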
100 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103 					  ip_hdr(skb)->saddr,
104 					  tcp_hdr(skb)->dest,
105 					  tcp_hdr(skb)->source);
106 }
107 
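/* Decide whether a TIME-WAIT socket may be reused by a new outgoing
 * connection to the same port pair; on success the new socket inherits the
 * timestamp state and a write_seq safely above the old send sequence.
 */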
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 
113 	/* With PAWS, it is safe from the viewpoint
114 	   of data integrity. Even without PAWS it is safe provided sequence
115 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
116 
117 	   Actually, the idea is close to VJ's: only the timestamp cache is
118 	   held not per host but per port pair, and the TW bucket is used as
119 	   the state holder.
120 
121 	   If the TW bucket has already been destroyed we fall back to VJ's
122 	   scheme and use the initial timestamp retrieved from the peer table.
123 	 */
124 	if (tcptw->tw_ts_recent_stamp &&
125 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
126 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128 		if (tp->write_seq == 0)
129 			tp->write_seq = 1;
130 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
131 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132 		sock_hold(sktw);
133 		return 1;
134 	}
135 
136 	return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
139 
140 /* This will initiate an outgoing connection. */
141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
142 {
143 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144 	struct inet_sock *inet = inet_sk(sk);
145 	struct tcp_sock *tp = tcp_sk(sk);
146 	__be16 orig_sport, orig_dport;
147 	__be32 daddr, nexthop;
148 	struct flowi4 *fl4;
149 	struct rtable *rt;
150 	int err;
151 	struct ip_options_rcu *inet_opt;
152 
153 	if (addr_len < sizeof(struct sockaddr_in))
154 		return -EINVAL;
155 
156 	if (usin->sin_family != AF_INET)
157 		return -EAFNOSUPPORT;
158 
159 	nexthop = daddr = usin->sin_addr.s_addr;
160 	inet_opt = rcu_dereference_protected(inet->inet_opt,
161 					     sock_owned_by_user(sk));
162 	if (inet_opt && inet_opt->opt.srr) {
163 		if (!daddr)
164 			return -EINVAL;
165 		nexthop = inet_opt->opt.faddr;
166 	}
167 
168 	orig_sport = inet->inet_sport;
169 	orig_dport = usin->sin_port;
170 	fl4 = &inet->cork.fl.u.ip4;
171 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173 			      IPPROTO_TCP,
174 			      orig_sport, orig_dport, sk);
175 	if (IS_ERR(rt)) {
176 		err = PTR_ERR(rt);
177 		if (err == -ENETUNREACH)
178 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
179 		return err;
180 	}
181 
182 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183 		ip_rt_put(rt);
184 		return -ENETUNREACH;
185 	}
186 
187 	if (!inet_opt || !inet_opt->opt.srr)
188 		daddr = fl4->daddr;
189 
190 	if (!inet->inet_saddr)
191 		inet->inet_saddr = fl4->saddr;
192 	inet->inet_rcv_saddr = inet->inet_saddr;
193 
194 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195 		/* Reset inherited state */
196 		tp->rx_opt.ts_recent	   = 0;
197 		tp->rx_opt.ts_recent_stamp = 0;
198 		if (likely(!tp->repair))
199 			tp->write_seq	   = 0;
200 	}
201 
202 	if (tcp_death_row.sysctl_tw_recycle &&
203 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
204 		tcp_fetch_timewait_stamp(sk, &rt->dst);
205 
206 	inet->inet_dport = usin->sin_port;
207 	inet->inet_daddr = daddr;
208 
209 	inet_csk(sk)->icsk_ext_hdr_len = 0;
210 	if (inet_opt)
211 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212 
213 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214 
215 	/* Socket identity is still unknown (sport may be zero).
216 	 * However, we set the state to SYN-SENT and, without releasing the socket
217 	 * lock, select a source port, enter ourselves into the hash tables and
218 	 * complete initialization after this.
219 	 */
220 	tcp_set_state(sk, TCP_SYN_SENT);
221 	err = inet_hash_connect(&tcp_death_row, sk);
222 	if (err)
223 		goto failure;
224 
225 	inet_set_txhash(sk);
226 
227 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 			       inet->inet_sport, inet->inet_dport, sk);
229 	if (IS_ERR(rt)) {
230 		err = PTR_ERR(rt);
231 		rt = NULL;
232 		goto failure;
233 	}
234 	/* OK, now commit destination to socket.  */
235 	sk->sk_gso_type = SKB_GSO_TCPV4;
236 	sk_setup_caps(sk, &rt->dst);
237 
238 	if (!tp->write_seq && likely(!tp->repair))
239 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
240 							   inet->inet_daddr,
241 							   inet->inet_sport,
242 							   usin->sin_port);
243 
244 	inet->inet_id = tp->write_seq ^ jiffies;
245 
246 	err = tcp_connect(sk);
247 
248 	rt = NULL;
249 	if (err)
250 		goto failure;
251 
252 	return 0;
253 
254 failure:
255 	/*
256 	 * This unhashes the socket and releases the local port,
257 	 * if necessary.
258 	 */
259 	tcp_set_state(sk, TCP_CLOSE);
260 	ip_rt_put(rt);
261 	sk->sk_route_caps = 0;
262 	inet->inet_dport = 0;
263 	return err;
264 }
265 EXPORT_SYMBOL(tcp_v4_connect);
266 
267 /*
268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269  * It can be called through tcp_release_cb() if socket was owned by user
270  * at the time tcp_v4_err() was called to handle ICMP message.
271  */
272 void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274 	struct dst_entry *dst;
275 	struct inet_sock *inet = inet_sk(sk);
276 	u32 mtu = tcp_sk(sk)->mtu_info;
277 
278 	dst = inet_csk_update_pmtu(sk, mtu);
279 	if (!dst)
280 		return;
281 
282 	/* Something is about to go wrong... Remember the soft error
283 	 * in case this connection is not able to recover.
284 	 */
285 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286 		sk->sk_err_soft = EMSGSIZE;
287 
288 	mtu = dst_mtu(dst);
289 
290 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291 	    ip_sk_accept_pmtu(sk) &&
292 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293 		tcp_sync_mss(sk, mtu);
294 
295 		/* Resend the TCP packet because it's
296 		 * clear that the old packet has been
297 		 * dropped. This is the new "fast" path mtu
298 		 * discovery.
299 		 */
300 		tcp_simple_retransmit(sk);
301 	} /* else let the usual retransmit timer handle it */
302 }
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
304 
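/* Handle an ICMP redirect by letting the cached route update itself. */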
305 static void do_redirect(struct sk_buff *skb, struct sock *sk)
306 {
307 	struct dst_entry *dst = __sk_dst_check(sk, 0);
308 
309 	if (dst)
310 		dst->ops->redirect(dst, sk, skb);
311 }
312 
313 /*
314  * This routine is called by the ICMP module when it gets some
315  * sort of error condition.  If err < 0 then the socket should
316  * be closed and the error returned to the user.  If err > 0
317  * it's just the icmp type << 8 | icmp code.  After adjustment
318  * header points to the first 8 bytes of the tcp header.  We need
319  * to find the appropriate port.
320  *
321  * The locking strategy used here is very "optimistic". When
322  * someone else accesses the socket the ICMP is just dropped
323  * and for some paths there is no check at all.
324  * A more general error queue to queue errors for later handling
325  * is probably better.
326  *
327  */
328 
329 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
330 {
331 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
332 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
333 	struct inet_connection_sock *icsk;
334 	struct tcp_sock *tp;
335 	struct inet_sock *inet;
336 	const int type = icmp_hdr(icmp_skb)->type;
337 	const int code = icmp_hdr(icmp_skb)->code;
338 	struct sock *sk;
339 	struct sk_buff *skb;
340 	struct request_sock *fastopen;
341 	__u32 seq, snd_una;
342 	__u32 remaining;
343 	int err;
344 	struct net *net = dev_net(icmp_skb->dev);
345 
346 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
347 			iph->saddr, th->source, inet_iif(icmp_skb));
348 	if (!sk) {
349 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
350 		return;
351 	}
352 	if (sk->sk_state == TCP_TIME_WAIT) {
353 		inet_twsk_put(inet_twsk(sk));
354 		return;
355 	}
356 
357 	bh_lock_sock(sk);
358 	/* If too many ICMPs get dropped on busy
359 	 * servers this needs to be solved differently.
360 	 * We do take care of the PMTU discovery (RFC 1191) special case:
361 	 * we can receive locally generated ICMP messages while the socket is held.
362 	 */
363 	if (sock_owned_by_user(sk)) {
364 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
365 			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
366 	}
367 	if (sk->sk_state == TCP_CLOSE)
368 		goto out;
369 
370 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
371 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
372 		goto out;
373 	}
374 
375 	icsk = inet_csk(sk);
376 	tp = tcp_sk(sk);
377 	seq = ntohl(th->seq);
378 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
379 	fastopen = tp->fastopen_rsk;
380 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
381 	if (sk->sk_state != TCP_LISTEN &&
382 	    !between(seq, snd_una, tp->snd_nxt)) {
383 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
384 		goto out;
385 	}
386 
387 	switch (type) {
388 	case ICMP_REDIRECT:
389 		do_redirect(icmp_skb, sk);
390 		goto out;
391 	case ICMP_SOURCE_QUENCH:
392 		/* Just silently ignore these. */
393 		goto out;
394 	case ICMP_PARAMETERPROB:
395 		err = EPROTO;
396 		break;
397 	case ICMP_DEST_UNREACH:
398 		if (code > NR_ICMP_UNREACH)
399 			goto out;
400 
401 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
402 			/* We are not interested in TCP_LISTEN and open_requests
403 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
404 			 * they should go through unfragmented).
405 			 */
406 			if (sk->sk_state == TCP_LISTEN)
407 				goto out;
408 
409 			tp->mtu_info = info;
410 			if (!sock_owned_by_user(sk)) {
411 				tcp_v4_mtu_reduced(sk);
412 			} else {
413 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
414 					sock_hold(sk);
415 			}
416 			goto out;
417 		}
418 
419 		err = icmp_err_convert[code].errno;
420 		/* check if icmp_skb allows revert of backoff
421 		 * (see draft-zimmermann-tcp-lcd) */
422 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
423 			break;
424 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
425 		    !icsk->icsk_backoff || fastopen)
426 			break;
427 
428 		if (sock_owned_by_user(sk))
429 			break;
430 
431 		icsk->icsk_backoff--;
432 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
433 					       TCP_TIMEOUT_INIT;
434 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
435 
436 		skb = tcp_write_queue_head(sk);
437 		BUG_ON(!skb);
438 
439 		remaining = icsk->icsk_rto -
440 			    min(icsk->icsk_rto,
441 				tcp_time_stamp - tcp_skb_timestamp(skb));
442 
443 		if (remaining) {
444 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
445 						  remaining, TCP_RTO_MAX);
446 		} else {
447 			/* RTO revert clocked out retransmission.
448 			 * Will retransmit now */
449 			tcp_retransmit_timer(sk);
450 		}
451 
452 		break;
453 	case ICMP_TIME_EXCEEDED:
454 		err = EHOSTUNREACH;
455 		break;
456 	default:
457 		goto out;
458 	}
459 
460 	switch (sk->sk_state) {
461 		struct request_sock *req, **prev;
462 	case TCP_LISTEN:
463 		if (sock_owned_by_user(sk))
464 			goto out;
465 
466 		req = inet_csk_search_req(sk, &prev, th->dest,
467 					  iph->daddr, iph->saddr);
468 		if (!req)
469 			goto out;
470 
471 		/* ICMPs are not backlogged, hence we cannot get
472 		   an established socket here.
473 		 */
474 		WARN_ON(req->sk);
475 
476 		if (seq != tcp_rsk(req)->snt_isn) {
477 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
478 			goto out;
479 		}
480 
481 		/*
482 		 * Still in SYN_RECV, just remove it silently.
483 		 * There is no good way to pass the error to the newly
484 		 * created socket, and POSIX does not want network
485 		 * errors returned from accept().
486 		 */
487 		inet_csk_reqsk_queue_drop(sk, req, prev);
488 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
489 		goto out;
490 
491 	case TCP_SYN_SENT:
492 	case TCP_SYN_RECV:
493 		/* Only in fast or simultaneous open. If a fast open socket is
494 		 * already accepted it is treated as a connected one below.
495 		 */
496 		if (fastopen && fastopen->sk == NULL)
497 			break;
498 
499 		if (!sock_owned_by_user(sk)) {
500 			sk->sk_err = err;
501 
502 			sk->sk_error_report(sk);
503 
504 			tcp_done(sk);
505 		} else {
506 			sk->sk_err_soft = err;
507 		}
508 		goto out;
509 	}
510 
511 	/* If we've already connected we will keep trying
512 	 * until we time out, or the user gives up.
513 	 *
514 	 * rfc1122 4.2.3.9 allows to consider as hard errors
515 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
516 	 * but it is obsoleted by pmtu discovery).
517 	 *
518 	 * Note that in the modern internet, where routing is unreliable
519 	 * and broken firewalls sit in every dark corner sending random
520 	 * errors ordered by their masters, even these two messages finally lose
521 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
522 	 *
523 	 * Now we are in compliance with RFCs.
524 	 *							--ANK (980905)
525 	 */
526 
527 	inet = inet_sk(sk);
528 	if (!sock_owned_by_user(sk) && inet->recverr) {
529 		sk->sk_err = err;
530 		sk->sk_error_report(sk);
531 	} else	{ /* Only an error on timeout */
532 		sk->sk_err_soft = err;
533 	}
534 
535 out:
536 	bh_unlock_sock(sk);
537 	sock_put(sk);
538 }
539 
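/* Fill in the TCP checksum for an outgoing segment.  With CHECKSUM_PARTIAL
 * only the pseudo-header sum is stored and csum_start/csum_offset are set so
 * the checksum can be completed later; otherwise the full checksum is
 * computed in software.
 */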
540 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
541 {
542 	struct tcphdr *th = tcp_hdr(skb);
543 
544 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
545 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
546 		skb->csum_start = skb_transport_header(skb) - skb->head;
547 		skb->csum_offset = offsetof(struct tcphdr, check);
548 	} else {
549 		th->check = tcp_v4_check(skb->len, saddr, daddr,
550 					 csum_partial(th,
551 						      th->doff << 2,
552 						      skb->csum));
553 	}
554 }
555 
556 /* This routine computes an IPv4 TCP checksum. */
557 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
558 {
559 	const struct inet_sock *inet = inet_sk(sk);
560 
561 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
562 }
563 EXPORT_SYMBOL(tcp_v4_send_check);
564 
565 /*
566  *	This routine will send an RST to the other tcp.
567  *
568  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
569  *		      for the reset?
570  *	Answer: if a packet caused the RST, it is not for a socket
571  *		existing in our system; if it is matched to a socket,
572  *		it is just a duplicate segment or a bug in the other side's TCP.
573  *		So we build the reply based only on parameters that
574  *		arrived with the segment.
575  *	Exception: precedence violation. We do not implement it in any case.
576  */
577 
578 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
579 {
580 	const struct tcphdr *th = tcp_hdr(skb);
581 	struct {
582 		struct tcphdr th;
583 #ifdef CONFIG_TCP_MD5SIG
584 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
585 #endif
586 	} rep;
587 	struct ip_reply_arg arg;
588 #ifdef CONFIG_TCP_MD5SIG
589 	struct tcp_md5sig_key *key;
590 	const __u8 *hash_location = NULL;
591 	unsigned char newhash[16];
592 	int genhash;
593 	struct sock *sk1 = NULL;
594 #endif
595 	struct net *net;
596 
597 	/* Never send a reset in response to a reset. */
598 	if (th->rst)
599 		return;
600 
601 	/* If sk is not NULL, it means we did a successful lookup and the incoming
602 	 * route had to be correct. prequeue might have dropped our dst.
603 	 */
604 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
605 		return;
606 
607 	/* Swap the send and the receive. */
608 	memset(&rep, 0, sizeof(rep));
609 	rep.th.dest   = th->source;
610 	rep.th.source = th->dest;
611 	rep.th.doff   = sizeof(struct tcphdr) / 4;
612 	rep.th.rst    = 1;
613 
614 	if (th->ack) {
615 		rep.th.seq = th->ack_seq;
616 	} else {
617 		rep.th.ack = 1;
618 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
619 				       skb->len - (th->doff << 2));
620 	}
621 
622 	memset(&arg, 0, sizeof(arg));
623 	arg.iov[0].iov_base = (unsigned char *)&rep;
624 	arg.iov[0].iov_len  = sizeof(rep.th);
625 
626 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
627 #ifdef CONFIG_TCP_MD5SIG
628 	hash_location = tcp_parse_md5sig_option(th);
629 	if (!sk && hash_location) {
630 		/*
631 		 * The active side is lost. Try to find the listening socket through
632 		 * the source port, and then find the md5 key through that listening
633 		 * socket. We do not lose security here:
634 		 * the incoming packet is checked against the md5 hash of the key we find,
635 		 * and no RST is generated if the md5 hash doesn't match.
636 		 */
637 		sk1 = __inet_lookup_listener(net,
638 					     &tcp_hashinfo, ip_hdr(skb)->saddr,
639 					     th->source, ip_hdr(skb)->daddr,
640 					     ntohs(th->source), inet_iif(skb));
641 		/* don't send rst if it can't find key */
642 		if (!sk1)
643 			return;
644 		rcu_read_lock();
645 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
646 					&ip_hdr(skb)->saddr, AF_INET);
647 		if (!key)
648 			goto release_sk1;
649 
650 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
651 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
652 			goto release_sk1;
653 	} else {
654 		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
655 					     &ip_hdr(skb)->saddr,
656 					     AF_INET) : NULL;
657 	}
658 
659 	if (key) {
660 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
661 				   (TCPOPT_NOP << 16) |
662 				   (TCPOPT_MD5SIG << 8) |
663 				   TCPOLEN_MD5SIG);
664 		/* Update length and the length the header thinks exists */
665 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
666 		rep.th.doff = arg.iov[0].iov_len / 4;
667 
668 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
669 				     key, ip_hdr(skb)->saddr,
670 				     ip_hdr(skb)->daddr, &rep.th);
671 	}
672 #endif
673 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
674 				      ip_hdr(skb)->saddr, /* XXX */
675 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
676 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
677 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
678 	/* When the socket is gone, all binding information is lost and
679 	 * routing might fail in this case. No choice here: if we choose to force
680 	 * the input interface, we will misroute in case of an asymmetric route.
681 	 */
682 	if (sk)
683 		arg.bound_dev_if = sk->sk_bound_dev_if;
684 
685 	arg.tos = ip_hdr(skb)->tos;
686 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
687 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
688 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
689 			      &arg, arg.iov[0].iov_len);
690 
691 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
692 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
693 
694 #ifdef CONFIG_TCP_MD5SIG
695 release_sk1:
696 	if (sk1) {
697 		rcu_read_unlock();
698 		sock_put(sk1);
699 	}
700 #endif
701 }
702 
703 /* The code below, sending ACKs in SYN-RECV and TIME-WAIT states
704    outside of socket context, is certainly ugly. What can I do?
705  */
706 
707 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
708 			    u32 win, u32 tsval, u32 tsecr, int oif,
709 			    struct tcp_md5sig_key *key,
710 			    int reply_flags, u8 tos)
711 {
712 	const struct tcphdr *th = tcp_hdr(skb);
713 	struct {
714 		struct tcphdr th;
715 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
716 #ifdef CONFIG_TCP_MD5SIG
717 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
718 #endif
719 			];
720 	} rep;
721 	struct ip_reply_arg arg;
722 	struct net *net = dev_net(skb_dst(skb)->dev);
723 
724 	memset(&rep.th, 0, sizeof(struct tcphdr));
725 	memset(&arg, 0, sizeof(arg));
726 
727 	arg.iov[0].iov_base = (unsigned char *)&rep;
728 	arg.iov[0].iov_len  = sizeof(rep.th);
729 	if (tsecr) {
730 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
731 				   (TCPOPT_TIMESTAMP << 8) |
732 				   TCPOLEN_TIMESTAMP);
733 		rep.opt[1] = htonl(tsval);
734 		rep.opt[2] = htonl(tsecr);
735 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
736 	}
737 
738 	/* Swap the send and the receive. */
739 	rep.th.dest    = th->source;
740 	rep.th.source  = th->dest;
741 	rep.th.doff    = arg.iov[0].iov_len / 4;
742 	rep.th.seq     = htonl(seq);
743 	rep.th.ack_seq = htonl(ack);
744 	rep.th.ack     = 1;
745 	rep.th.window  = htons(win);
746 
747 #ifdef CONFIG_TCP_MD5SIG
748 	if (key) {
749 		int offset = (tsecr) ? 3 : 0;
750 
751 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
752 					  (TCPOPT_NOP << 16) |
753 					  (TCPOPT_MD5SIG << 8) |
754 					  TCPOLEN_MD5SIG);
755 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
756 		rep.th.doff = arg.iov[0].iov_len/4;
757 
758 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
759 				    key, ip_hdr(skb)->saddr,
760 				    ip_hdr(skb)->daddr, &rep.th);
761 	}
762 #endif
763 	arg.flags = reply_flags;
764 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
765 				      ip_hdr(skb)->saddr, /* XXX */
766 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
767 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
768 	if (oif)
769 		arg.bound_dev_if = oif;
770 	arg.tos = tos;
771 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
772 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
773 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
774 			      &arg, arg.iov[0].iov_len);
775 
776 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
777 }
778 
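/* Answer a segment received in TIME-WAIT with a duplicate ACK built from the
 * state saved in the timewait socket (snd_nxt, rcv_nxt, window and
 * timestamps).
 */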
779 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
780 {
781 	struct inet_timewait_sock *tw = inet_twsk(sk);
782 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
783 
784 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
785 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
786 			tcp_time_stamp + tcptw->tw_ts_offset,
787 			tcptw->tw_ts_recent,
788 			tw->tw_bound_dev_if,
789 			tcp_twsk_md5_key(tcptw),
790 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
791 			tw->tw_tos
792 			);
793 
794 	inet_twsk_put(tw);
795 }
796 
797 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
798 				  struct request_sock *req)
799 {
800 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
801 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
802 	 */
803 	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
804 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
805 			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
806 			tcp_time_stamp,
807 			req->ts_recent,
808 			0,
809 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
810 					  AF_INET),
811 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
812 			ip_hdr(skb)->tos);
813 }
814 
815 /*
816  *	Send a SYN-ACK after having received a SYN.
817  *	This still operates on a request_sock only, not on a big
818  *	socket.
819  */
820 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
821 			      struct flowi *fl,
822 			      struct request_sock *req,
823 			      u16 queue_mapping,
824 			      struct tcp_fastopen_cookie *foc)
825 {
826 	const struct inet_request_sock *ireq = inet_rsk(req);
827 	struct flowi4 fl4;
828 	int err = -1;
829 	struct sk_buff *skb;
830 
831 	/* First, grab a route. */
832 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
833 		return -1;
834 
835 	skb = tcp_make_synack(sk, dst, req, foc);
836 
837 	if (skb) {
838 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
839 
840 		skb_set_queue_mapping(skb, queue_mapping);
841 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
842 					    ireq->ir_rmt_addr,
843 					    ireq->opt);
844 		err = net_xmit_eval(err);
845 	}
846 
847 	return err;
848 }
849 
850 /*
851  *	IPv4 request_sock destructor.
852  */
853 static void tcp_v4_reqsk_destructor(struct request_sock *req)
854 {
855 	kfree(inet_rsk(req)->opt);
856 }
857 
858 /*
859  * Return true if a syncookie should be sent
860  */
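/* The decision is driven by sysctl_tcp_syncookies; note that a value of 2
 * additionally suppresses the rate-limited SYN-flood warning below.
 */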
861 bool tcp_syn_flood_action(struct sock *sk,
862 			 const struct sk_buff *skb,
863 			 const char *proto)
864 {
865 	const char *msg = "Dropping request";
866 	bool want_cookie = false;
867 	struct listen_sock *lopt;
868 
869 #ifdef CONFIG_SYN_COOKIES
870 	if (sysctl_tcp_syncookies) {
871 		msg = "Sending cookies";
872 		want_cookie = true;
873 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
874 	} else
875 #endif
876 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
877 
878 	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
879 	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
880 		lopt->synflood_warned = 1;
881 		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
882 			proto, ntohs(tcp_hdr(skb)->dest), msg);
883 	}
884 	return want_cookie;
885 }
886 EXPORT_SYMBOL(tcp_syn_flood_action);
887 
888 #ifdef CONFIG_TCP_MD5SIG
889 /*
890  * RFC2385 MD5 checksumming requires a mapping of
891  * IP address->MD5 Key.
892  * We need to maintain these in the sk structure.
893  */
894 
895 /* Find the Key structure for an address.  */
896 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
897 					 const union tcp_md5_addr *addr,
898 					 int family)
899 {
900 	struct tcp_sock *tp = tcp_sk(sk);
901 	struct tcp_md5sig_key *key;
902 	unsigned int size = sizeof(struct in_addr);
903 	struct tcp_md5sig_info *md5sig;
904 
905 	/* caller either holds rcu_read_lock() or socket lock */
906 	md5sig = rcu_dereference_check(tp->md5sig_info,
907 				       sock_owned_by_user(sk) ||
908 				       lockdep_is_held(&sk->sk_lock.slock));
909 	if (!md5sig)
910 		return NULL;
911 #if IS_ENABLED(CONFIG_IPV6)
912 	if (family == AF_INET6)
913 		size = sizeof(struct in6_addr);
914 #endif
915 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
916 		if (key->family != family)
917 			continue;
918 		if (!memcmp(&key->addr, addr, size))
919 			return key;
920 	}
921 	return NULL;
922 }
923 EXPORT_SYMBOL(tcp_md5_do_lookup);
924 
925 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
926 					 struct sock *addr_sk)
927 {
928 	union tcp_md5_addr *addr;
929 
930 	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
931 	return tcp_md5_do_lookup(sk, addr, AF_INET);
932 }
933 EXPORT_SYMBOL(tcp_v4_md5_lookup);
934 
935 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
936 						      struct request_sock *req)
937 {
938 	union tcp_md5_addr *addr;
939 
940 	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
941 	return tcp_md5_do_lookup(sk, addr, AF_INET);
942 }
943 
944 /* This can be called on a newly created socket, from other files */
945 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
946 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
947 {
948 	/* Add Key to the list */
949 	struct tcp_md5sig_key *key;
950 	struct tcp_sock *tp = tcp_sk(sk);
951 	struct tcp_md5sig_info *md5sig;
952 
953 	key = tcp_md5_do_lookup(sk, addr, family);
954 	if (key) {
955 		/* Pre-existing entry - just update that one. */
956 		memcpy(key->key, newkey, newkeylen);
957 		key->keylen = newkeylen;
958 		return 0;
959 	}
960 
961 	md5sig = rcu_dereference_protected(tp->md5sig_info,
962 					   sock_owned_by_user(sk));
963 	if (!md5sig) {
964 		md5sig = kmalloc(sizeof(*md5sig), gfp);
965 		if (!md5sig)
966 			return -ENOMEM;
967 
968 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
969 		INIT_HLIST_HEAD(&md5sig->head);
970 		rcu_assign_pointer(tp->md5sig_info, md5sig);
971 	}
972 
973 	key = sock_kmalloc(sk, sizeof(*key), gfp);
974 	if (!key)
975 		return -ENOMEM;
976 	if (!tcp_alloc_md5sig_pool()) {
977 		sock_kfree_s(sk, key, sizeof(*key));
978 		return -ENOMEM;
979 	}
980 
981 	memcpy(key->key, newkey, newkeylen);
982 	key->keylen = newkeylen;
983 	key->family = family;
984 	memcpy(&key->addr, addr,
985 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
986 				      sizeof(struct in_addr));
987 	hlist_add_head_rcu(&key->node, &md5sig->head);
988 	return 0;
989 }
990 EXPORT_SYMBOL(tcp_md5_do_add);
991 
992 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
993 {
994 	struct tcp_md5sig_key *key;
995 
996 	key = tcp_md5_do_lookup(sk, addr, family);
997 	if (!key)
998 		return -ENOENT;
999 	hlist_del_rcu(&key->node);
1000 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1001 	kfree_rcu(key, rcu);
1002 	return 0;
1003 }
1004 EXPORT_SYMBOL(tcp_md5_do_del);
1005 
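/* Release every configured MD5 key when the socket is torn down. */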
1006 static void tcp_clear_md5_list(struct sock *sk)
1007 {
1008 	struct tcp_sock *tp = tcp_sk(sk);
1009 	struct tcp_md5sig_key *key;
1010 	struct hlist_node *n;
1011 	struct tcp_md5sig_info *md5sig;
1012 
1013 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1014 
1015 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1016 		hlist_del_rcu(&key->node);
1017 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1018 		kfree_rcu(key, rcu);
1019 	}
1020 }
1021 
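/* Handle the TCP_MD5SIG setsockopt: a zero key length deletes the key for
 * the given peer address, otherwise the key is added or replaced.  As an
 * illustrative userspace sketch (field names follow the uapi
 * struct tcp_md5sig as recalled here, not taken from this file):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 16 };
 *	memcpy(&md5.tcpm_addr, &peer_sin, sizeof(peer_sin));
 *	memcpy(md5.tcpm_key, secret, 16);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */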
1022 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1023 				 int optlen)
1024 {
1025 	struct tcp_md5sig cmd;
1026 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1027 
1028 	if (optlen < sizeof(cmd))
1029 		return -EINVAL;
1030 
1031 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1032 		return -EFAULT;
1033 
1034 	if (sin->sin_family != AF_INET)
1035 		return -EINVAL;
1036 
1037 	if (!cmd.tcpm_keylen)
1038 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1039 				      AF_INET);
1040 
1041 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1042 		return -EINVAL;
1043 
1044 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1045 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1046 			      GFP_KERNEL);
1047 }
1048 
1049 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1050 					__be32 daddr, __be32 saddr, int nbytes)
1051 {
1052 	struct tcp4_pseudohdr *bp;
1053 	struct scatterlist sg;
1054 
1055 	bp = &hp->md5_blk.ip4;
1056 
1057 	/*
1058 	 * 1. the TCP pseudo-header (in the order: source IP address,
1059 	 * destination IP address, zero-padded protocol number, and
1060 	 * segment length)
1061 	 */
1062 	bp->saddr = saddr;
1063 	bp->daddr = daddr;
1064 	bp->pad = 0;
1065 	bp->protocol = IPPROTO_TCP;
1066 	bp->len = cpu_to_be16(nbytes);
1067 
1068 	sg_init_one(&sg, bp, sizeof(*bp));
1069 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1070 }
1071 
1072 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1073 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1074 {
1075 	struct tcp_md5sig_pool *hp;
1076 	struct hash_desc *desc;
1077 
1078 	hp = tcp_get_md5sig_pool();
1079 	if (!hp)
1080 		goto clear_hash_noput;
1081 	desc = &hp->md5_desc;
1082 
1083 	if (crypto_hash_init(desc))
1084 		goto clear_hash;
1085 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1086 		goto clear_hash;
1087 	if (tcp_md5_hash_header(hp, th))
1088 		goto clear_hash;
1089 	if (tcp_md5_hash_key(hp, key))
1090 		goto clear_hash;
1091 	if (crypto_hash_final(desc, md5_hash))
1092 		goto clear_hash;
1093 
1094 	tcp_put_md5sig_pool();
1095 	return 0;
1096 
1097 clear_hash:
1098 	tcp_put_md5sig_pool();
1099 clear_hash_noput:
1100 	memset(md5_hash, 0, 16);
1101 	return 1;
1102 }
1103 
1104 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1105 			const struct sock *sk, const struct request_sock *req,
1106 			const struct sk_buff *skb)
1107 {
1108 	struct tcp_md5sig_pool *hp;
1109 	struct hash_desc *desc;
1110 	const struct tcphdr *th = tcp_hdr(skb);
1111 	__be32 saddr, daddr;
1112 
1113 	if (sk) {
1114 		saddr = inet_sk(sk)->inet_saddr;
1115 		daddr = inet_sk(sk)->inet_daddr;
1116 	} else if (req) {
1117 		saddr = inet_rsk(req)->ir_loc_addr;
1118 		daddr = inet_rsk(req)->ir_rmt_addr;
1119 	} else {
1120 		const struct iphdr *iph = ip_hdr(skb);
1121 		saddr = iph->saddr;
1122 		daddr = iph->daddr;
1123 	}
1124 
1125 	hp = tcp_get_md5sig_pool();
1126 	if (!hp)
1127 		goto clear_hash_noput;
1128 	desc = &hp->md5_desc;
1129 
1130 	if (crypto_hash_init(desc))
1131 		goto clear_hash;
1132 
1133 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1134 		goto clear_hash;
1135 	if (tcp_md5_hash_header(hp, th))
1136 		goto clear_hash;
1137 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1138 		goto clear_hash;
1139 	if (tcp_md5_hash_key(hp, key))
1140 		goto clear_hash;
1141 	if (crypto_hash_final(desc, md5_hash))
1142 		goto clear_hash;
1143 
1144 	tcp_put_md5sig_pool();
1145 	return 0;
1146 
1147 clear_hash:
1148 	tcp_put_md5sig_pool();
1149 clear_hash_noput:
1150 	memset(md5_hash, 0, 16);
1151 	return 1;
1152 }
1153 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1154 
1155 static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
1156 				      const struct sk_buff *skb)
1157 {
1158 	/*
1159 	 * This gets called for each TCP segment that arrives
1160 	 * so we want to be efficient.
1161 	 * We have 3 drop cases:
1162 	 * o No MD5 hash and one expected.
1163 	 * o MD5 hash and we're not expecting one.
1164 	 * o MD5 hash and it's wrong.
1165 	 */
1166 	const __u8 *hash_location = NULL;
1167 	struct tcp_md5sig_key *hash_expected;
1168 	const struct iphdr *iph = ip_hdr(skb);
1169 	const struct tcphdr *th = tcp_hdr(skb);
1170 	int genhash;
1171 	unsigned char newhash[16];
1172 
1173 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1174 					  AF_INET);
1175 	hash_location = tcp_parse_md5sig_option(th);
1176 
1177 	/* We've parsed the options - do we have a hash? */
1178 	if (!hash_expected && !hash_location)
1179 		return false;
1180 
1181 	if (hash_expected && !hash_location) {
1182 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1183 		return true;
1184 	}
1185 
1186 	if (!hash_expected && hash_location) {
1187 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1188 		return true;
1189 	}
1190 
1191 	/* Okay, so this is hash_expected and hash_location -
1192 	 * so we need to calculate the checksum.
1193 	 */
1194 	genhash = tcp_v4_md5_hash_skb(newhash,
1195 				      hash_expected,
1196 				      NULL, NULL, skb);
1197 
1198 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1199 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1200 				     &iph->saddr, ntohs(th->source),
1201 				     &iph->daddr, ntohs(th->dest),
1202 				     genhash ? " tcp_v4_calc_md5_hash failed"
1203 				     : "");
1204 		return true;
1205 	}
1206 	return false;
1207 }
1208 
1209 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1210 {
1211 	bool ret;
1212 
1213 	rcu_read_lock();
1214 	ret = __tcp_v4_inbound_md5_hash(sk, skb);
1215 	rcu_read_unlock();
1216 
1217 	return ret;
1218 }
1219 
1220 #endif
1221 
1222 static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
1223 			    struct sk_buff *skb)
1224 {
1225 	struct inet_request_sock *ireq = inet_rsk(req);
1226 
1227 	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
1228 	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
1229 	ireq->no_srccheck = inet_sk(sk)->transparent;
1230 	ireq->opt = tcp_v4_save_options(skb);
1231 	ireq->ireq_family = AF_INET;
1232 }
1233 
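/* Find a route for the SYN-ACK.  When the caller asks for a strict check,
 * report whether the chosen route's destination still matches the peer
 * address recorded in the request.
 */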
1234 static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
1235 					  const struct request_sock *req,
1236 					  bool *strict)
1237 {
1238 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1239 
1240 	if (strict) {
1241 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1242 			*strict = true;
1243 		else
1244 			*strict = false;
1245 	}
1246 
1247 	return dst;
1248 }
1249 
1250 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1251 	.family		=	PF_INET,
1252 	.obj_size	=	sizeof(struct tcp_request_sock),
1253 	.rtx_syn_ack	=	tcp_rtx_synack,
1254 	.send_ack	=	tcp_v4_reqsk_send_ack,
1255 	.destructor	=	tcp_v4_reqsk_destructor,
1256 	.send_reset	=	tcp_v4_send_reset,
1257 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1258 };
1259 
1260 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1261 	.mss_clamp	=	TCP_MSS_DEFAULT,
1262 #ifdef CONFIG_TCP_MD5SIG
1263 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1264 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1265 #endif
1266 	.init_req	=	tcp_v4_init_req,
1267 #ifdef CONFIG_SYN_COOKIES
1268 	.cookie_init_seq =	cookie_v4_init_sequence,
1269 #endif
1270 	.route_req	=	tcp_v4_route_req,
1271 	.init_seq	=	tcp_v4_init_sequence,
1272 	.send_synack	=	tcp_v4_send_synack,
1273 	.queue_hash_add =	inet_csk_reqsk_queue_hash_add,
1274 };
1275 
1276 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1277 {
1278 	/* Never answer SYNs sent to broadcast or multicast */
1279 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1280 		goto drop;
1281 
1282 	return tcp_conn_request(&tcp_request_sock_ops,
1283 				&tcp_request_sock_ipv4_ops, sk, skb);
1284 
1285 drop:
1286 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1287 	return 0;
1288 }
1289 EXPORT_SYMBOL(tcp_v4_conn_request);
1290 
1291 
1292 /*
1293  * The three way handshake has completed - we got a valid synack -
1294  * now create the new socket.
1295  */
1296 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1297 				  struct request_sock *req,
1298 				  struct dst_entry *dst)
1299 {
1300 	struct inet_request_sock *ireq;
1301 	struct inet_sock *newinet;
1302 	struct tcp_sock *newtp;
1303 	struct sock *newsk;
1304 #ifdef CONFIG_TCP_MD5SIG
1305 	struct tcp_md5sig_key *key;
1306 #endif
1307 	struct ip_options_rcu *inet_opt;
1308 
1309 	if (sk_acceptq_is_full(sk))
1310 		goto exit_overflow;
1311 
1312 	newsk = tcp_create_openreq_child(sk, req, skb);
1313 	if (!newsk)
1314 		goto exit_nonewsk;
1315 
1316 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1317 	inet_sk_rx_dst_set(newsk, skb);
1318 
1319 	newtp		      = tcp_sk(newsk);
1320 	newinet		      = inet_sk(newsk);
1321 	ireq		      = inet_rsk(req);
1322 	newinet->inet_daddr   = ireq->ir_rmt_addr;
1323 	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
1324 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1325 	inet_opt	      = ireq->opt;
1326 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1327 	ireq->opt	      = NULL;
1328 	newinet->mc_index     = inet_iif(skb);
1329 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1330 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1331 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1332 	inet_set_txhash(newsk);
1333 	if (inet_opt)
1334 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1335 	newinet->inet_id = newtp->write_seq ^ jiffies;
1336 
1337 	if (!dst) {
1338 		dst = inet_csk_route_child_sock(sk, newsk, req);
1339 		if (!dst)
1340 			goto put_and_exit;
1341 	} else {
1342 		/* syncookie case: see end of cookie_v4_check() */
1343 	}
1344 	sk_setup_caps(newsk, dst);
1345 
1346 	tcp_ca_openreq_child(newsk, dst);
1347 
1348 	tcp_sync_mss(newsk, dst_mtu(dst));
1349 	newtp->advmss = dst_metric_advmss(dst);
1350 	if (tcp_sk(sk)->rx_opt.user_mss &&
1351 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1352 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1353 
1354 	tcp_initialize_rcv_mss(newsk);
1355 
1356 #ifdef CONFIG_TCP_MD5SIG
1357 	/* Copy over the MD5 key from the original socket */
1358 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1359 				AF_INET);
1360 	if (key != NULL) {
1361 		/*
1362 		 * We're using one, so create a matching key
1363 		 * on the newsk structure. If we fail to get
1364 		 * memory, then we end up not copying the key
1365 		 * across. Shucks.
1366 		 */
1367 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1368 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1369 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1370 	}
1371 #endif
1372 
1373 	if (__inet_inherit_port(sk, newsk) < 0)
1374 		goto put_and_exit;
1375 	__inet_hash_nolisten(newsk, NULL);
1376 
1377 	return newsk;
1378 
1379 exit_overflow:
1380 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1381 exit_nonewsk:
1382 	dst_release(dst);
1383 exit:
1384 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1385 	return NULL;
1386 put_and_exit:
1387 	inet_csk_prepare_forced_close(newsk);
1388 	tcp_done(newsk);
1389 	goto exit;
1390 }
1391 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1392 
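/* For a listening socket, map an incoming segment to either a pending
 * connection request, an already established child socket, or (for non-SYN
 * segments, when SYN cookies are in use) a cookie-validated child.
 * Returns NULL if the segment should be discarded.
 */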
1393 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1394 {
1395 	struct tcphdr *th = tcp_hdr(skb);
1396 	const struct iphdr *iph = ip_hdr(skb);
1397 	struct sock *nsk;
1398 	struct request_sock **prev;
1399 	/* Find possible connection requests. */
1400 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1401 						       iph->saddr, iph->daddr);
1402 	if (req)
1403 		return tcp_check_req(sk, skb, req, prev, false);
1404 
1405 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1406 			th->source, iph->daddr, th->dest, inet_iif(skb));
1407 
1408 	if (nsk) {
1409 		if (nsk->sk_state != TCP_TIME_WAIT) {
1410 			bh_lock_sock(nsk);
1411 			return nsk;
1412 		}
1413 		inet_twsk_put(inet_twsk(nsk));
1414 		return NULL;
1415 	}
1416 
1417 #ifdef CONFIG_SYN_COOKIES
1418 	if (!th->syn)
1419 		sk = cookie_v4_check(sk, skb);
1420 #endif
1421 	return sk;
1422 }
1423 
1424 /* The socket must have its spinlock held when we get
1425  * here.
1426  *
1427  * We have a potential double-lock case here, so even when
1428  * doing backlog processing we use the BH locking scheme.
1429  * This is because we cannot sleep with the original spinlock
1430  * held.
1431  */
1432 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1433 {
1434 	struct sock *rsk;
1435 
1436 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1437 		struct dst_entry *dst = sk->sk_rx_dst;
1438 
1439 		sock_rps_save_rxhash(sk, skb);
1440 		sk_mark_napi_id(sk, skb);
1441 		if (dst) {
1442 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1443 			    dst->ops->check(dst, 0) == NULL) {
1444 				dst_release(dst);
1445 				sk->sk_rx_dst = NULL;
1446 			}
1447 		}
1448 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1449 		return 0;
1450 	}
1451 
1452 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1453 		goto csum_err;
1454 
1455 	if (sk->sk_state == TCP_LISTEN) {
1456 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1457 		if (!nsk)
1458 			goto discard;
1459 
1460 		if (nsk != sk) {
1461 			sock_rps_save_rxhash(nsk, skb);
1462 			sk_mark_napi_id(sk, skb);
1463 			if (tcp_child_process(sk, nsk, skb)) {
1464 				rsk = nsk;
1465 				goto reset;
1466 			}
1467 			return 0;
1468 		}
1469 	} else
1470 		sock_rps_save_rxhash(sk, skb);
1471 
1472 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1473 		rsk = sk;
1474 		goto reset;
1475 	}
1476 	return 0;
1477 
1478 reset:
1479 	tcp_v4_send_reset(rsk, skb);
1480 discard:
1481 	kfree_skb(skb);
1482 	/* Be careful here. If this function gets more complicated and
1483 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1484 	 * might be destroyed here. This current version compiles correctly,
1485 	 * but you have been warned.
1486 	 */
1487 	return 0;
1488 
1489 csum_err:
1490 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1491 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1492 	goto discard;
1493 }
1494 EXPORT_SYMBOL(tcp_v4_do_rcv);
1495 
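/* Early demux: look up an established socket straight from the receive path
 * so that its cached rx dst can be reused before routing.
 */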
1496 void tcp_v4_early_demux(struct sk_buff *skb)
1497 {
1498 	const struct iphdr *iph;
1499 	const struct tcphdr *th;
1500 	struct sock *sk;
1501 
1502 	if (skb->pkt_type != PACKET_HOST)
1503 		return;
1504 
1505 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1506 		return;
1507 
1508 	iph = ip_hdr(skb);
1509 	th = tcp_hdr(skb);
1510 
1511 	if (th->doff < sizeof(struct tcphdr) / 4)
1512 		return;
1513 
1514 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1515 				       iph->saddr, th->source,
1516 				       iph->daddr, ntohs(th->dest),
1517 				       skb->skb_iif);
1518 	if (sk) {
1519 		skb->sk = sk;
1520 		skb->destructor = sock_edemux;
1521 		if (sk_fullsock(sk)) {
1522 			struct dst_entry *dst = sk->sk_rx_dst;
1523 
1524 			if (dst)
1525 				dst = dst_check(dst, 0);
1526 			if (dst &&
1527 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1528 				skb_dst_set_noref(skb, dst);
1529 		}
1530 	}
1531 }
1532 
1533 /* Packet is added to VJ-style prequeue for processing in process
1534  * context, if a reader task is waiting. Apparently, this exciting
1535  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1536  * failed somewhere. Latency? Burstiness? Well, at least now we will
1537  * see why it failed. 8)8)				  --ANK
1538  *
1539  */
1540 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1541 {
1542 	struct tcp_sock *tp = tcp_sk(sk);
1543 
1544 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1545 		return false;
1546 
1547 	if (skb->len <= tcp_hdrlen(skb) &&
1548 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1549 		return false;
1550 
1551 	/* Before escaping RCU protected region, we need to take care of skb
1552 	 * dst. Prequeue is only enabled for established sockets.
1553 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1554 	 * Instead of doing a full sk_rx_dst validity check here, let's perform
1555 	 * an optimistic check.
1556 	 */
1557 	if (likely(sk->sk_rx_dst))
1558 		skb_dst_drop(skb);
1559 	else
1560 		skb_dst_force(skb);
1561 
1562 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1563 	tp->ucopy.memory += skb->truesize;
1564 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1565 		struct sk_buff *skb1;
1566 
1567 		BUG_ON(sock_owned_by_user(sk));
1568 
1569 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1570 			sk_backlog_rcv(sk, skb1);
1571 			NET_INC_STATS_BH(sock_net(sk),
1572 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1573 		}
1574 
1575 		tp->ucopy.memory = 0;
1576 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1577 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1578 					   POLLIN | POLLRDNORM | POLLRDBAND);
1579 		if (!inet_csk_ack_scheduled(sk))
1580 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1581 						  (3 * tcp_rto_min(sk)) / 4,
1582 						  TCP_RTO_MAX);
1583 	}
1584 	return true;
1585 }
1586 EXPORT_SYMBOL(tcp_prequeue);
1587 
1588 /*
1589  *	From tcp_input.c
1590  */
1591 
1592 int tcp_v4_rcv(struct sk_buff *skb)
1593 {
1594 	const struct iphdr *iph;
1595 	const struct tcphdr *th;
1596 	struct sock *sk;
1597 	int ret;
1598 	struct net *net = dev_net(skb->dev);
1599 
1600 	if (skb->pkt_type != PACKET_HOST)
1601 		goto discard_it;
1602 
1603 	/* Count it even if it's bad */
1604 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1605 
1606 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1607 		goto discard_it;
1608 
1609 	th = tcp_hdr(skb);
1610 
1611 	if (th->doff < sizeof(struct tcphdr) / 4)
1612 		goto bad_packet;
1613 	if (!pskb_may_pull(skb, th->doff * 4))
1614 		goto discard_it;
1615 
1616 	/* An explanation is required here, I think.
1617 	 * Packet length and doff are validated by header prediction,
1618 	 * provided the case of th->doff == 0 is eliminated.
1619 	 * So, we defer the checks. */
1620 
1621 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1622 		goto csum_error;
1623 
1624 	th = tcp_hdr(skb);
1625 	iph = ip_hdr(skb);
1626 	/* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
1627 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1628 	 */
1629 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1630 		sizeof(struct inet_skb_parm));
1631 	barrier();
1632 
1633 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1634 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1635 				    skb->len - th->doff * 4);
1636 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1637 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1638 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1639 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1640 	TCP_SKB_CB(skb)->sacked	 = 0;
1641 
1642 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1643 	if (!sk)
1644 		goto no_tcp_socket;
1645 
1646 process:
1647 	if (sk->sk_state == TCP_TIME_WAIT)
1648 		goto do_time_wait;
1649 
1650 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1651 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1652 		goto discard_and_relse;
1653 	}
1654 
1655 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1656 		goto discard_and_relse;
1657 
1658 #ifdef CONFIG_TCP_MD5SIG
1659 	/*
1660 	 * We really want to reject the packet as early as possible
1661 	 * if:
1662 	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1663 	 *  o There is an MD5 option and we're not expecting one
1664 	 */
1665 	if (tcp_v4_inbound_md5_hash(sk, skb))
1666 		goto discard_and_relse;
1667 #endif
1668 
1669 	nf_reset(skb);
1670 
1671 	if (sk_filter(sk, skb))
1672 		goto discard_and_relse;
1673 
1674 	sk_incoming_cpu_update(sk);
1675 	skb->dev = NULL;
1676 
1677 	bh_lock_sock_nested(sk);
1678 	ret = 0;
1679 	if (!sock_owned_by_user(sk)) {
1680 		if (!tcp_prequeue(sk, skb))
1681 			ret = tcp_v4_do_rcv(sk, skb);
1682 	} else if (unlikely(sk_add_backlog(sk, skb,
1683 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1684 		bh_unlock_sock(sk);
1685 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1686 		goto discard_and_relse;
1687 	}
1688 	bh_unlock_sock(sk);
1689 
1690 	sock_put(sk);
1691 
1692 	return ret;
1693 
1694 no_tcp_socket:
1695 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1696 		goto discard_it;
1697 
1698 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1699 csum_error:
1700 		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1701 bad_packet:
1702 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1703 	} else {
1704 		tcp_v4_send_reset(NULL, skb);
1705 	}
1706 
1707 discard_it:
1708 	/* Discard frame. */
1709 	kfree_skb(skb);
1710 	return 0;
1711 
1712 discard_and_relse:
1713 	sock_put(sk);
1714 	goto discard_it;
1715 
1716 do_time_wait:
1717 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1718 		inet_twsk_put(inet_twsk(sk));
1719 		goto discard_it;
1720 	}
1721 
1722 	if (skb->len < (th->doff << 2)) {
1723 		inet_twsk_put(inet_twsk(sk));
1724 		goto bad_packet;
1725 	}
1726 	if (tcp_checksum_complete(skb)) {
1727 		inet_twsk_put(inet_twsk(sk));
1728 		goto csum_error;
1729 	}
1730 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1731 	case TCP_TW_SYN: {
1732 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1733 							&tcp_hashinfo,
1734 							iph->saddr, th->source,
1735 							iph->daddr, th->dest,
1736 							inet_iif(skb));
1737 		if (sk2) {
1738 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1739 			inet_twsk_put(inet_twsk(sk));
1740 			sk = sk2;
1741 			goto process;
1742 		}
1743 		/* Fall through to ACK */
1744 	}
1745 	case TCP_TW_ACK:
1746 		tcp_v4_timewait_ack(sk, skb);
1747 		break;
1748 	case TCP_TW_RST:
1749 		goto no_tcp_socket;
1750 	case TCP_TW_SUCCESS:;
1751 	}
1752 	goto discard_it;
1753 }
1754 
1755 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1756 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1757 	.twsk_unique	= tcp_twsk_unique,
1758 	.twsk_destructor= tcp_twsk_destructor,
1759 };
1760 
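/* Cache the skb's route and incoming interface on the socket so the fast
 * path in tcp_v4_do_rcv() can validate and reuse them.
 */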
1761 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1762 {
1763 	struct dst_entry *dst = skb_dst(skb);
1764 
1765 	if (dst) {
1766 		dst_hold(dst);
1767 		sk->sk_rx_dst = dst;
1768 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1769 	}
1770 }
1771 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1772 
1773 const struct inet_connection_sock_af_ops ipv4_specific = {
1774 	.queue_xmit	   = ip_queue_xmit,
1775 	.send_check	   = tcp_v4_send_check,
1776 	.rebuild_header	   = inet_sk_rebuild_header,
1777 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1778 	.conn_request	   = tcp_v4_conn_request,
1779 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1780 	.net_header_len	   = sizeof(struct iphdr),
1781 	.setsockopt	   = ip_setsockopt,
1782 	.getsockopt	   = ip_getsockopt,
1783 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1784 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1785 	.bind_conflict	   = inet_csk_bind_conflict,
1786 #ifdef CONFIG_COMPAT
1787 	.compat_setsockopt = compat_ip_setsockopt,
1788 	.compat_getsockopt = compat_ip_getsockopt,
1789 #endif
1790 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1791 };
1792 EXPORT_SYMBOL(ipv4_specific);
1793 
1794 #ifdef CONFIG_TCP_MD5SIG
1795 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1796 	.md5_lookup		= tcp_v4_md5_lookup,
1797 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1798 	.md5_parse		= tcp_v4_parse_md5_keys,
1799 };
1800 #endif
1801 
1802 /* NOTE: A lot of things are set to zero explicitly by the call to
1803  *       sk_alloc(), so they need not be done here.
1804  */
1805 static int tcp_v4_init_sock(struct sock *sk)
1806 {
1807 	struct inet_connection_sock *icsk = inet_csk(sk);
1808 
1809 	tcp_init_sock(sk);
1810 
1811 	icsk->icsk_af_ops = &ipv4_specific;
1812 
1813 #ifdef CONFIG_TCP_MD5SIG
1814 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1815 #endif
1816 
1817 	return 0;
1818 }
1819 
1820 void tcp_v4_destroy_sock(struct sock *sk)
1821 {
1822 	struct tcp_sock *tp = tcp_sk(sk);
1823 
1824 	tcp_clear_xmit_timers(sk);
1825 
1826 	tcp_cleanup_congestion_control(sk);
1827 
1828 	/* Clean up the write buffer. */
1829 	tcp_write_queue_purge(sk);
1830 
1831 	/* Clean up our, hopefully empty, out_of_order_queue. */
1832 	__skb_queue_purge(&tp->out_of_order_queue);
1833 
1834 #ifdef CONFIG_TCP_MD5SIG
1835 	/* Clean up the MD5 key list, if any */
1836 	if (tp->md5sig_info) {
1837 		tcp_clear_md5_list(sk);
1838 		kfree_rcu(tp->md5sig_info, rcu);
1839 		tp->md5sig_info = NULL;
1840 	}
1841 #endif
1842 
1843 	/* Clean up the prequeue; it really should already be empty. */
1844 	__skb_queue_purge(&tp->ucopy.prequeue);
1845 
1846 	/* Clean up a referenced TCP bind bucket. */
1847 	if (inet_csk(sk)->icsk_bind_hash)
1848 		inet_put_port(sk);
1849 
1850 	BUG_ON(tp->fastopen_rsk != NULL);
1851 
1852 	/* If the socket was aborted during the connect operation */
1853 	tcp_free_fastopen_req(tp);
1854 
1855 	sk_sockets_allocated_dec(sk);
1856 	sock_release_memcg(sk);
1857 }
1858 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1859 
1860 #ifdef CONFIG_PROC_FS
1861 /* Proc filesystem TCP sock list dumping. */
1862 
1863 /*
1864  * Get the next listener socket following cur.  If cur is NULL, get the first
1865  * socket, starting from the bucket given in st->bucket; when st->bucket is
1866  * zero, the very first socket in the hash table is returned.
1867  */
1868 static void *listening_get_next(struct seq_file *seq, void *cur)
1869 {
1870 	struct inet_connection_sock *icsk;
1871 	struct hlist_nulls_node *node;
1872 	struct sock *sk = cur;
1873 	struct inet_listen_hashbucket *ilb;
1874 	struct tcp_iter_state *st = seq->private;
1875 	struct net *net = seq_file_net(seq);
1876 
1877 	if (!sk) {
1878 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1879 		spin_lock_bh(&ilb->lock);
1880 		sk = sk_nulls_head(&ilb->head);
1881 		st->offset = 0;
1882 		goto get_sk;
1883 	}
1884 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1885 	++st->num;
1886 	++st->offset;
1887 
1888 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1889 		struct request_sock *req = cur;
1890 
1891 		icsk = inet_csk(st->syn_wait_sk);
1892 		req = req->dl_next;
1893 		while (1) {
1894 			while (req) {
1895 				if (req->rsk_ops->family == st->family) {
1896 					cur = req;
1897 					goto out;
1898 				}
1899 				req = req->dl_next;
1900 			}
1901 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1902 				break;
1903 get_req:
1904 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1905 		}
1906 		sk	  = sk_nulls_next(st->syn_wait_sk);
1907 		st->state = TCP_SEQ_STATE_LISTENING;
1908 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1909 	} else {
1910 		icsk = inet_csk(sk);
1911 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1912 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1913 			goto start_req;
1914 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1915 		sk = sk_nulls_next(sk);
1916 	}
1917 get_sk:
1918 	sk_nulls_for_each_from(sk, node) {
1919 		if (!net_eq(sock_net(sk), net))
1920 			continue;
1921 		if (sk->sk_family == st->family) {
1922 			cur = sk;
1923 			goto out;
1924 		}
1925 		icsk = inet_csk(sk);
1926 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1927 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1928 start_req:
1929 			st->uid		= sock_i_uid(sk);
1930 			st->syn_wait_sk = sk;
1931 			st->state	= TCP_SEQ_STATE_OPENREQ;
1932 			st->sbucket	= 0;
1933 			goto get_req;
1934 		}
1935 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1936 	}
1937 	spin_unlock_bh(&ilb->lock);
1938 	st->offset = 0;
1939 	if (++st->bucket < INET_LHTABLE_SIZE) {
1940 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1941 		spin_lock_bh(&ilb->lock);
1942 		sk = sk_nulls_head(&ilb->head);
1943 		goto get_sk;
1944 	}
1945 	cur = NULL;
1946 out:
1947 	return cur;
1948 }
1949 
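/*
 * Return the listening socket at logical position *pos, walking the
 * listening hash from the start and decrementing *pos for every entry
 * skipped.
 */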
1950 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1951 {
1952 	struct tcp_iter_state *st = seq->private;
1953 	void *rc;
1954 
1955 	st->bucket = 0;
1956 	st->offset = 0;
1957 	rc = listening_get_next(seq, NULL);
1958 
1959 	while (rc && *pos) {
1960 		rc = listening_get_next(seq, rc);
1961 		--*pos;
1962 	}
1963 	return rc;
1964 }
1965 
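/* True if the established-hash bucket selected by st->bucket is empty. */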
1966 static inline bool empty_bucket(const struct tcp_iter_state *st)
1967 {
1968 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1969 }
1970 
1971 /*
1972  * Get the first established socket, starting from the bucket given in st->bucket.
1973  * If st->bucket is zero, the very first socket in the hash table is returned.
1974  */
1975 static void *established_get_first(struct seq_file *seq)
1976 {
1977 	struct tcp_iter_state *st = seq->private;
1978 	struct net *net = seq_file_net(seq);
1979 	void *rc = NULL;
1980 
1981 	st->offset = 0;
1982 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1983 		struct sock *sk;
1984 		struct hlist_nulls_node *node;
1985 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1986 
1987 		/* Lockless fast path for the common case of empty buckets */
1988 		if (empty_bucket(st))
1989 			continue;
1990 
1991 		spin_lock_bh(lock);
1992 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1993 			if (sk->sk_family != st->family ||
1994 			    !net_eq(sock_net(sk), net)) {
1995 				continue;
1996 			}
1997 			rc = sk;
1998 			goto out;
1999 		}
2000 		spin_unlock_bh(lock);
2001 	}
2002 out:
2003 	return rc;
2004 }
2005 
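/*
 * Advance to the next matching socket in the established hash, dropping the
 * current bucket's lock and moving on to the next bucket when the chain ends.
 */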
2006 static void *established_get_next(struct seq_file *seq, void *cur)
2007 {
2008 	struct sock *sk = cur;
2009 	struct hlist_nulls_node *node;
2010 	struct tcp_iter_state *st = seq->private;
2011 	struct net *net = seq_file_net(seq);
2012 
2013 	++st->num;
2014 	++st->offset;
2015 
2016 	sk = sk_nulls_next(sk);
2017 
2018 	sk_nulls_for_each_from(sk, node) {
2019 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2020 			return sk;
2021 	}
2022 
2023 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2024 	++st->bucket;
2025 	return established_get_first(seq);
2026 }
2027 
2028 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2029 {
2030 	struct tcp_iter_state *st = seq->private;
2031 	void *rc;
2032 
2033 	st->bucket = 0;
2034 	rc = established_get_first(seq);
2035 
2036 	while (rc && pos) {
2037 		rc = established_get_next(seq, rc);
2038 		--pos;
2039 	}
2040 	return rc;
2041 }
2042 
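/*
 * Find entry number pos, looking first in the listening hash and then, if it
 * is not there, in the established hash.
 */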
2043 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2044 {
2045 	void *rc;
2046 	struct tcp_iter_state *st = seq->private;
2047 
2048 	st->state = TCP_SEQ_STATE_LISTENING;
2049 	rc	  = listening_get_idx(seq, &pos);
2050 
2051 	if (!rc) {
2052 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2053 		rc	  = established_get_idx(seq, pos);
2054 	}
2055 
2056 	return rc;
2057 }
2058 
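/*
 * Resume iteration at the state, bucket and in-bucket offset left behind by
 * the previous read of this seq file, so a continued read need not rescan
 * from the first bucket.  Returns NULL if that position can no longer be
 * reached.
 */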
2059 static void *tcp_seek_last_pos(struct seq_file *seq)
2060 {
2061 	struct tcp_iter_state *st = seq->private;
2062 	int offset = st->offset;
2063 	int orig_num = st->num;
2064 	void *rc = NULL;
2065 
2066 	switch (st->state) {
2067 	case TCP_SEQ_STATE_OPENREQ:
2068 	case TCP_SEQ_STATE_LISTENING:
2069 		if (st->bucket >= INET_LHTABLE_SIZE)
2070 			break;
2071 		st->state = TCP_SEQ_STATE_LISTENING;
2072 		rc = listening_get_next(seq, NULL);
2073 		while (offset-- && rc)
2074 			rc = listening_get_next(seq, rc);
2075 		if (rc)
2076 			break;
2077 		st->bucket = 0;
2078 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2079 		/* Fallthrough */
2080 	case TCP_SEQ_STATE_ESTABLISHED:
2081 		if (st->bucket > tcp_hashinfo.ehash_mask)
2082 			break;
2083 		rc = established_get_first(seq);
2084 		while (offset-- && rc)
2085 			rc = established_get_next(seq, rc);
2086 	}
2087 
2088 	st->num = orig_num;
2089 
2090 	return rc;
2091 }
2092 
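/*
 * seq_file iterator callbacks: start() and next() walk the listening hash
 * and then the established hash; stop() releases whatever bucket lock is
 * still held for the current iterator state.
 */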
2093 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2094 {
2095 	struct tcp_iter_state *st = seq->private;
2096 	void *rc;
2097 
2098 	if (*pos && *pos == st->last_pos) {
2099 		rc = tcp_seek_last_pos(seq);
2100 		if (rc)
2101 			goto out;
2102 	}
2103 
2104 	st->state = TCP_SEQ_STATE_LISTENING;
2105 	st->num = 0;
2106 	st->bucket = 0;
2107 	st->offset = 0;
2108 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2109 
2110 out:
2111 	st->last_pos = *pos;
2112 	return rc;
2113 }
2114 
2115 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2116 {
2117 	struct tcp_iter_state *st = seq->private;
2118 	void *rc = NULL;
2119 
2120 	if (v == SEQ_START_TOKEN) {
2121 		rc = tcp_get_idx(seq, 0);
2122 		goto out;
2123 	}
2124 
2125 	switch (st->state) {
2126 	case TCP_SEQ_STATE_OPENREQ:
2127 	case TCP_SEQ_STATE_LISTENING:
2128 		rc = listening_get_next(seq, v);
2129 		if (!rc) {
2130 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2131 			st->bucket = 0;
2132 			st->offset = 0;
2133 			rc	  = established_get_first(seq);
2134 		}
2135 		break;
2136 	case TCP_SEQ_STATE_ESTABLISHED:
2137 		rc = established_get_next(seq, v);
2138 		break;
2139 	}
2140 out:
2141 	++*pos;
2142 	st->last_pos = *pos;
2143 	return rc;
2144 }
2145 
2146 static void tcp_seq_stop(struct seq_file *seq, void *v)
2147 {
2148 	struct tcp_iter_state *st = seq->private;
2149 
2150 	switch (st->state) {
2151 	case TCP_SEQ_STATE_OPENREQ:
2152 		if (v) {
2153 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2154 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2155 		}
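		/* Fall through: the listening bucket lock is held as well. */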
2156 	case TCP_SEQ_STATE_LISTENING:
2157 		if (v != SEQ_START_TOKEN)
2158 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2159 		break;
2160 	case TCP_SEQ_STATE_ESTABLISHED:
2161 		if (v)
2162 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2163 		break;
2164 	}
2165 }
2166 
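/* open() for the per-family proc entry: set up the iterator state. */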
2167 int tcp_seq_open(struct inode *inode, struct file *file)
2168 {
2169 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2170 	struct tcp_iter_state *s;
2171 	int err;
2172 
2173 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2174 			  sizeof(struct tcp_iter_state));
2175 	if (err < 0)
2176 		return err;
2177 
2178 	s = ((struct seq_file *)file->private_data)->private;
2179 	s->family		= afinfo->family;
2180 	s->last_pos		= 0;
2181 	return 0;
2182 }
2183 EXPORT_SYMBOL(tcp_seq_open);
2184 
2185 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2186 {
2187 	int rc = 0;
2188 	struct proc_dir_entry *p;
2189 
2190 	afinfo->seq_ops.start		= tcp_seq_start;
2191 	afinfo->seq_ops.next		= tcp_seq_next;
2192 	afinfo->seq_ops.stop		= tcp_seq_stop;
2193 
2194 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2195 			     afinfo->seq_fops, afinfo);
2196 	if (!p)
2197 		rc = -ENOMEM;
2198 	return rc;
2199 }
2200 EXPORT_SYMBOL(tcp_proc_register);
2201 
2202 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2203 {
2204 	remove_proc_entry(afinfo->name, net->proc_net);
2205 }
2206 EXPORT_SYMBOL(tcp_proc_unregister);
2207 
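/* Format one /proc/net/tcp line for a request socket in SYN_RECV state. */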
2208 static void get_openreq4(const struct request_sock *req,
2209 			 struct seq_file *f, int i, kuid_t uid)
2210 {
2211 	const struct inet_request_sock *ireq = inet_rsk(req);
2212 	long delta = req->expires - jiffies;
2213 
2214 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2215 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2216 		i,
2217 		ireq->ir_loc_addr,
2218 		ireq->ir_num,
2219 		ireq->ir_rmt_addr,
2220 		ntohs(ireq->ir_rmt_port),
2221 		TCP_SYN_RECV,
2222 		0, 0, /* could print the option size, but that is AF-dependent. */
2223 		1,    /* timers active (only the expire timer) */
2224 		jiffies_delta_to_clock_t(delta),
2225 		req->num_timeout,
2226 		from_kuid_munged(seq_user_ns(f), uid),
2227 		0,  /* non-standard timer */
2228 		0, /* open_requests have no inode */
2229 		0,
2230 		req);
2231 }
2232 
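/*
 * Format one /proc/net/tcp line for a full socket, including queue sizes,
 * the currently pending timer and congestion state.
 */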
2233 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2234 {
2235 	int timer_active;
2236 	unsigned long timer_expires;
2237 	const struct tcp_sock *tp = tcp_sk(sk);
2238 	const struct inet_connection_sock *icsk = inet_csk(sk);
2239 	const struct inet_sock *inet = inet_sk(sk);
2240 	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2241 	__be32 dest = inet->inet_daddr;
2242 	__be32 src = inet->inet_rcv_saddr;
2243 	__u16 destp = ntohs(inet->inet_dport);
2244 	__u16 srcp = ntohs(inet->inet_sport);
2245 	int rx_queue;
2246 
2247 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2248 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2249 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2250 		timer_active	= 1;
2251 		timer_expires	= icsk->icsk_timeout;
2252 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2253 		timer_active	= 4;
2254 		timer_expires	= icsk->icsk_timeout;
2255 	} else if (timer_pending(&sk->sk_timer)) {
2256 		timer_active	= 2;
2257 		timer_expires	= sk->sk_timer.expires;
2258 	} else {
2259 		timer_active	= 0;
2260 		timer_expires = jiffies;
2261 	}
2262 
2263 	if (sk->sk_state == TCP_LISTEN)
2264 		rx_queue = sk->sk_ack_backlog;
2265 	else
2266 		/*
2267 		 * Because we don't lock the socket, we might find a transient negative value.
2268 		 */
2269 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2270 
2271 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2272 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2273 		i, src, srcp, dest, destp, sk->sk_state,
2274 		tp->write_seq - tp->snd_una,
2275 		rx_queue,
2276 		timer_active,
2277 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2278 		icsk->icsk_retransmits,
2279 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2280 		icsk->icsk_probes_out,
2281 		sock_i_ino(sk),
2282 		atomic_read(&sk->sk_refcnt), sk,
2283 		jiffies_to_clock_t(icsk->icsk_rto),
2284 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2285 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2286 		tp->snd_cwnd,
2287 		sk->sk_state == TCP_LISTEN ?
2288 		    (fastopenq ? fastopenq->max_qlen : 0) :
2289 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2290 }
2291 
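/* Format one /proc/net/tcp line for a timewait socket. */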
2292 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2293 			       struct seq_file *f, int i)
2294 {
2295 	__be32 dest, src;
2296 	__u16 destp, srcp;
2297 	s32 delta = tw->tw_ttd - inet_tw_time_stamp();
2298 
2299 	dest  = tw->tw_daddr;
2300 	src   = tw->tw_rcv_saddr;
2301 	destp = ntohs(tw->tw_dport);
2302 	srcp  = ntohs(tw->tw_sport);
2303 
2304 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2305 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2306 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2307 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2308 		atomic_read(&tw->tw_refcnt), tw);
2309 }
2310 
2311 #define TMPSZ 150
2312 
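/*
 * show() callback: emit the header line for SEQ_START_TOKEN, otherwise
 * dispatch on the iterator state to print a request, full or timewait socket.
 */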
2313 static int tcp4_seq_show(struct seq_file *seq, void *v)
2314 {
2315 	struct tcp_iter_state *st;
2316 	struct sock *sk = v;
2317 
2318 	seq_setwidth(seq, TMPSZ - 1);
2319 	if (v == SEQ_START_TOKEN) {
2320 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2321 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2322 			   "inode");
2323 		goto out;
2324 	}
2325 	st = seq->private;
2326 
2327 	switch (st->state) {
2328 	case TCP_SEQ_STATE_LISTENING:
2329 	case TCP_SEQ_STATE_ESTABLISHED:
2330 		if (sk->sk_state == TCP_TIME_WAIT)
2331 			get_timewait4_sock(v, seq, st->num);
2332 		else
2333 			get_tcp4_sock(v, seq, st->num);
2334 		break;
2335 	case TCP_SEQ_STATE_OPENREQ:
2336 		get_openreq4(v, seq, st->num, st->uid);
2337 		break;
2338 	}
2339 out:
2340 	seq_pad(seq, '\n');
2341 	return 0;
2342 }
2343 
2344 static const struct file_operations tcp_afinfo_seq_fops = {
2345 	.owner   = THIS_MODULE,
2346 	.open    = tcp_seq_open,
2347 	.read    = seq_read,
2348 	.llseek  = seq_lseek,
2349 	.release = seq_release_net
2350 };
2351 
2352 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2353 	.name		= "tcp",
2354 	.family		= AF_INET,
2355 	.seq_fops	= &tcp_afinfo_seq_fops,
2356 	.seq_ops	= {
2357 		.show		= tcp4_seq_show,
2358 	},
2359 };
2360 
2361 static int __net_init tcp4_proc_init_net(struct net *net)
2362 {
2363 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2364 }
2365 
2366 static void __net_exit tcp4_proc_exit_net(struct net *net)
2367 {
2368 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2369 }
2370 
2371 static struct pernet_operations tcp4_net_ops = {
2372 	.init = tcp4_proc_init_net,
2373 	.exit = tcp4_proc_exit_net,
2374 };
2375 
2376 int __init tcp4_proc_init(void)
2377 {
2378 	return register_pernet_subsys(&tcp4_net_ops);
2379 }
2380 
2381 void tcp4_proc_exit(void)
2382 {
2383 	unregister_pernet_subsys(&tcp4_net_ops);
2384 }
2385 #endif /* CONFIG_PROC_FS */
2386 
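/*
 * The IPv4 TCP protocol descriptor: the hooks through which the generic
 * AF_INET socket layer drives TCP.
 */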
2387 struct proto tcp_prot = {
2388 	.name			= "TCP",
2389 	.owner			= THIS_MODULE,
2390 	.close			= tcp_close,
2391 	.connect		= tcp_v4_connect,
2392 	.disconnect		= tcp_disconnect,
2393 	.accept			= inet_csk_accept,
2394 	.ioctl			= tcp_ioctl,
2395 	.init			= tcp_v4_init_sock,
2396 	.destroy		= tcp_v4_destroy_sock,
2397 	.shutdown		= tcp_shutdown,
2398 	.setsockopt		= tcp_setsockopt,
2399 	.getsockopt		= tcp_getsockopt,
2400 	.recvmsg		= tcp_recvmsg,
2401 	.sendmsg		= tcp_sendmsg,
2402 	.sendpage		= tcp_sendpage,
2403 	.backlog_rcv		= tcp_v4_do_rcv,
2404 	.release_cb		= tcp_release_cb,
2405 	.hash			= inet_hash,
2406 	.unhash			= inet_unhash,
2407 	.get_port		= inet_csk_get_port,
2408 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2409 	.stream_memory_free	= tcp_stream_memory_free,
2410 	.sockets_allocated	= &tcp_sockets_allocated,
2411 	.orphan_count		= &tcp_orphan_count,
2412 	.memory_allocated	= &tcp_memory_allocated,
2413 	.memory_pressure	= &tcp_memory_pressure,
2414 	.sysctl_mem		= sysctl_tcp_mem,
2415 	.sysctl_wmem		= sysctl_tcp_wmem,
2416 	.sysctl_rmem		= sysctl_tcp_rmem,
2417 	.max_header		= MAX_TCP_HEADER,
2418 	.obj_size		= sizeof(struct tcp_sock),
2419 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2420 	.twsk_prot		= &tcp_timewait_sock_ops,
2421 	.rsk_prot		= &tcp_request_sock_ops,
2422 	.h.hashinfo		= &tcp_hashinfo,
2423 	.no_autobind		= true,
2424 #ifdef CONFIG_COMPAT
2425 	.compat_setsockopt	= compat_tcp_setsockopt,
2426 	.compat_getsockopt	= compat_tcp_getsockopt,
2427 #endif
2428 #ifdef CONFIG_MEMCG_KMEM
2429 	.init_cgroup		= tcp_init_cgroup,
2430 	.destroy_cgroup		= tcp_destroy_cgroup,
2431 	.proto_cgroup		= tcp_proto_cgroup,
2432 #endif
2433 };
2434 EXPORT_SYMBOL(tcp_prot);
2435 
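/* Per-namespace teardown: destroy the per-CPU control sockets. */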
2436 static void __net_exit tcp_sk_exit(struct net *net)
2437 {
2438 	int cpu;
2439 
2440 	for_each_possible_cpu(cpu)
2441 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2442 	free_percpu(net->ipv4.tcp_sk);
2443 }
2444 
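/*
 * Per-namespace setup: create one kernel control socket per possible CPU
 * (used, among other things, for sending RSTs and ACKs that are not tied to
 * a full socket) and initialise this namespace's TCP sysctl defaults.
 */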
2445 static int __net_init tcp_sk_init(struct net *net)
2446 {
2447 	int res, cpu;
2448 
2449 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2450 	if (!net->ipv4.tcp_sk)
2451 		return -ENOMEM;
2452 
2453 	for_each_possible_cpu(cpu) {
2454 		struct sock *sk;
2455 
2456 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2457 					   IPPROTO_TCP, net);
2458 		if (res)
2459 			goto fail;
2460 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2461 	}
2462 	net->ipv4.sysctl_tcp_ecn = 2;
2463 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2464 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2465 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2466 	return 0;
2467 
2468 fail:
2469 	tcp_sk_exit(net);
2470 
2471 	return res;
2472 }
2473 
2474 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2475 {
2476 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2477 }
2478 
2479 static struct pernet_operations __net_initdata tcp_sk_ops = {
2480        .init	   = tcp_sk_init,
2481        .exit	   = tcp_sk_exit,
2482        .exit_batch = tcp_sk_exit_batch,
2483 };
2484 
2485 void __init tcp_v4_init(void)
2486 {
2487 	inet_hashinfo_init(&tcp_hashinfo);
2488 	if (register_pernet_subsys(&tcp_sk_ops))
2489 		panic("Failed to create the TCP control socket.\n");
2490 }
2491