xref: /openbmc/linux/net/ipv4/tcp_ipv4.c (revision f7c35abe)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_low_latency __read_mostly;
88 
89 #ifdef CONFIG_TCP_MD5SIG
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
92 #endif
93 
94 struct inet_hashinfo tcp_hashinfo;
95 EXPORT_SYMBOL(tcp_hashinfo);
96 
97 static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
98 {
99 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
100 					  ip_hdr(skb)->saddr,
101 					  tcp_hdr(skb)->dest,
102 					  tcp_hdr(skb)->source, tsoff);
103 }
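
/* Note on the helper above: the initial sequence number is derived from the
 * connection 4-tuple through a keyed hash (secure_tcp_sequence_number()),
 * in the spirit of RFC 6528, so an off-path attacker cannot predict the ISN
 * of a new connection. The tsoff value returned alongside it is a timestamp
 * offset derived from the same hashed inputs.
 */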
104 
105 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
106 {
107 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
108 	struct tcp_sock *tp = tcp_sk(sk);
109 
110 	/* With PAWS, it is safe from the viewpoint
111 	   of data integrity. Even without PAWS it is safe provided sequence
112 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
113 
114 	   Actually, the idea is close to VJ's, except that the timestamp
115 	   cache is held not per host but per port pair, and the TW bucket
116 	   is used as the state holder.
117 
118 	   If the TW bucket has already been destroyed, we fall back to VJ's
119 	   scheme and use the initial timestamp retrieved from the peer table.
120 	 */
121 	if (tcptw->tw_ts_recent_stamp &&
122 	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
123 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
124 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
125 		if (tp->write_seq == 0)
126 			tp->write_seq = 1;
127 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
128 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
129 		sock_hold(sktw);
130 		return 1;
131 	}
132 
133 	return 0;
134 }
135 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
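
/* Worked example (hypothetical numbers) of the write_seq choice above: if the
 * old incarnation's tw_snd_nxt was 0x00001000, the reused connection starts
 * at 0x00001000 + 65535 + 2 = 0x00011001, i.e. just past the largest
 * unscaled window the old peer could still accept, so a delayed segment from
 * the previous incarnation cannot fall inside the new sequence space.
 */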
136 
137 /* This will initiate an outgoing connection. */
138 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
139 {
140 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
141 	struct inet_sock *inet = inet_sk(sk);
142 	struct tcp_sock *tp = tcp_sk(sk);
143 	__be16 orig_sport, orig_dport;
144 	__be32 daddr, nexthop;
145 	struct flowi4 *fl4;
146 	struct rtable *rt;
147 	int err;
148 	u32 seq;
149 	struct ip_options_rcu *inet_opt;
150 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
151 
152 	if (addr_len < sizeof(struct sockaddr_in))
153 		return -EINVAL;
154 
155 	if (usin->sin_family != AF_INET)
156 		return -EAFNOSUPPORT;
157 
158 	nexthop = daddr = usin->sin_addr.s_addr;
159 	inet_opt = rcu_dereference_protected(inet->inet_opt,
160 					     lockdep_sock_is_held(sk));
161 	if (inet_opt && inet_opt->opt.srr) {
162 		if (!daddr)
163 			return -EINVAL;
164 		nexthop = inet_opt->opt.faddr;
165 	}
166 
167 	orig_sport = inet->inet_sport;
168 	orig_dport = usin->sin_port;
169 	fl4 = &inet->cork.fl.u.ip4;
170 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172 			      IPPROTO_TCP,
173 			      orig_sport, orig_dport, sk);
174 	if (IS_ERR(rt)) {
175 		err = PTR_ERR(rt);
176 		if (err == -ENETUNREACH)
177 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178 		return err;
179 	}
180 
181 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182 		ip_rt_put(rt);
183 		return -ENETUNREACH;
184 	}
185 
186 	if (!inet_opt || !inet_opt->opt.srr)
187 		daddr = fl4->daddr;
188 
189 	if (!inet->inet_saddr)
190 		inet->inet_saddr = fl4->saddr;
191 	sk_rcv_saddr_set(sk, inet->inet_saddr);
192 
193 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194 		/* Reset inherited state */
195 		tp->rx_opt.ts_recent	   = 0;
196 		tp->rx_opt.ts_recent_stamp = 0;
197 		if (likely(!tp->repair))
198 			tp->write_seq	   = 0;
199 	}
200 
201 	if (tcp_death_row->sysctl_tw_recycle &&
202 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203 		tcp_fetch_timewait_stamp(sk, &rt->dst);
204 
205 	inet->inet_dport = usin->sin_port;
206 	sk_daddr_set(sk, daddr);
207 
208 	inet_csk(sk)->icsk_ext_hdr_len = 0;
209 	if (inet_opt)
210 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211 
212 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213 
214 	/* Socket identity is still unknown (sport may be zero).
215 	 * However we set state to SYN-SENT and, without releasing the socket
216 	 * lock, select a source port, enter ourselves into the hash tables and
217 	 * complete initialization after this.
218 	 */
219 	tcp_set_state(sk, TCP_SYN_SENT);
220 	err = inet_hash_connect(tcp_death_row, sk);
221 	if (err)
222 		goto failure;
223 
224 	sk_set_txhash(sk);
225 
226 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 			       inet->inet_sport, inet->inet_dport, sk);
228 	if (IS_ERR(rt)) {
229 		err = PTR_ERR(rt);
230 		rt = NULL;
231 		goto failure;
232 	}
233 	/* OK, now commit destination to socket.  */
234 	sk->sk_gso_type = SKB_GSO_TCPV4;
235 	sk_setup_caps(sk, &rt->dst);
236 	rt = NULL;
237 
238 	if (likely(!tp->repair)) {
239 		seq = secure_tcp_sequence_number(inet->inet_saddr,
240 						 inet->inet_daddr,
241 						 inet->inet_sport,
242 						 usin->sin_port,
243 						 &tp->tsoffset);
244 		if (!tp->write_seq)
245 			tp->write_seq = seq;
246 	}
247 
248 	inet->inet_id = tp->write_seq ^ jiffies;
249 
250 	if (tcp_fastopen_defer_connect(sk, &err))
251 		return err;
252 	if (err)
253 		goto failure;
254 
255 	err = tcp_connect(sk);
256 
257 	if (err)
258 		goto failure;
259 
260 	return 0;
261 
262 failure:
263 	/*
264 	 * This unhashes the socket and releases the local port,
265 	 * if necessary.
266 	 */
267 	tcp_set_state(sk, TCP_CLOSE);
268 	ip_rt_put(rt);
269 	sk->sk_route_caps = 0;
270 	inet->inet_dport = 0;
271 	return err;
272 }
273 EXPORT_SYMBOL(tcp_v4_connect);
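
/* For reference, the function above is where a plain userspace connect() on
 * an AF_INET stream socket ends up; a minimal sketch (error handling and the
 * illustrative address/port omitted from checking):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * connect() reaches tcp_v4_connect() via inet_stream_connect(); the function
 * then routes the flow, picks a source port in inet_hash_connect(), chooses
 * the initial sequence number and sends the SYN through tcp_connect().
 */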
274 
275 /*
276  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
277  * It can be called through tcp_release_cb() if the socket was owned by the
278  * user at the time tcp_v4_err() was called to handle the ICMP message.
279  */
280 void tcp_v4_mtu_reduced(struct sock *sk)
281 {
282 	struct inet_sock *inet = inet_sk(sk);
283 	struct dst_entry *dst;
284 	u32 mtu;
285 
286 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
287 		return;
288 	mtu = tcp_sk(sk)->mtu_info;
289 	dst = inet_csk_update_pmtu(sk, mtu);
290 	if (!dst)
291 		return;
292 
293 	/* Something is about to go wrong... Remember the soft error
294 	 * in case this connection is not able to recover.
295 	 */
296 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
297 		sk->sk_err_soft = EMSGSIZE;
298 
299 	mtu = dst_mtu(dst);
300 
301 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
302 	    ip_sk_accept_pmtu(sk) &&
303 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
304 		tcp_sync_mss(sk, mtu);
305 
306 		/* Resend the TCP packet because it's
307 		 * clear that the old packet has been
308 		 * dropped. This is the new "fast" path mtu
309 		 * discovery.
310 		 */
311 		tcp_simple_retransmit(sk);
312 	} /* else let the usual retransmit timer handle it */
313 }
314 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
315 
316 static void do_redirect(struct sk_buff *skb, struct sock *sk)
317 {
318 	struct dst_entry *dst = __sk_dst_check(sk, 0);
319 
320 	if (dst)
321 		dst->ops->redirect(dst, sk, skb);
322 }
323 
324 
325 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
326 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
327 {
328 	struct request_sock *req = inet_reqsk(sk);
329 	struct net *net = sock_net(sk);
330 
331 	/* ICMPs are not backlogged, hence we cannot get
332 	 * an established socket here.
333 	 */
334 	if (seq != tcp_rsk(req)->snt_isn) {
335 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
336 	} else if (abort) {
337 		/*
338 		 * Still in SYN_RECV, just remove it silently.
339 		 * There is no good way to pass the error to the newly
340 		 * created socket, and POSIX does not want network
341 		 * errors returned from accept().
342 		 */
343 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
344 		tcp_listendrop(req->rsk_listener);
345 	}
346 	reqsk_put(req);
347 }
348 EXPORT_SYMBOL(tcp_req_err);
349 
350 /*
351  * This routine is called by the ICMP module when it gets some
352  * sort of error condition.  If err < 0 then the socket should
353  * be closed and the error returned to the user.  If err > 0
354  * it's just the icmp type << 8 | icmp code.  After adjustment
355  * header points to the first 8 bytes of the tcp header.  We need
356  * to find the appropriate port.
357  *
358  * The locking strategy used here is very "optimistic". When
359  * someone else accesses the socket the ICMP is just dropped
360  * and for some paths there is no check at all.
361  * A more general error queue to queue errors for later handling
362  * is probably better.
363  *
364  */
365 
366 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
367 {
368 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
369 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
370 	struct inet_connection_sock *icsk;
371 	struct tcp_sock *tp;
372 	struct inet_sock *inet;
373 	const int type = icmp_hdr(icmp_skb)->type;
374 	const int code = icmp_hdr(icmp_skb)->code;
375 	struct sock *sk;
376 	struct sk_buff *skb;
377 	struct request_sock *fastopen;
378 	__u32 seq, snd_una;
379 	__u32 remaining;
380 	int err;
381 	struct net *net = dev_net(icmp_skb->dev);
382 
383 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
384 				       th->dest, iph->saddr, ntohs(th->source),
385 				       inet_iif(icmp_skb));
386 	if (!sk) {
387 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
388 		return;
389 	}
390 	if (sk->sk_state == TCP_TIME_WAIT) {
391 		inet_twsk_put(inet_twsk(sk));
392 		return;
393 	}
394 	seq = ntohl(th->seq);
395 	if (sk->sk_state == TCP_NEW_SYN_RECV)
396 		return tcp_req_err(sk, seq,
397 				  type == ICMP_PARAMETERPROB ||
398 				  type == ICMP_TIME_EXCEEDED ||
399 				  (type == ICMP_DEST_UNREACH &&
400 				   (code == ICMP_NET_UNREACH ||
401 				    code == ICMP_HOST_UNREACH)));
402 
403 	bh_lock_sock(sk);
404 	/* If too many ICMPs get dropped on busy
405 	 * servers this needs to be solved differently.
406 	 * We do take care of the PMTU discovery (RFC1191) special case:
407 	 * we can receive locally generated ICMP messages while the socket is held.
408 	 */
409 	if (sock_owned_by_user(sk)) {
410 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
411 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
412 	}
413 	if (sk->sk_state == TCP_CLOSE)
414 		goto out;
415 
416 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
417 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
418 		goto out;
419 	}
420 
421 	icsk = inet_csk(sk);
422 	tp = tcp_sk(sk);
423 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
424 	fastopen = tp->fastopen_rsk;
425 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
426 	if (sk->sk_state != TCP_LISTEN &&
427 	    !between(seq, snd_una, tp->snd_nxt)) {
428 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
429 		goto out;
430 	}
431 
432 	switch (type) {
433 	case ICMP_REDIRECT:
434 		if (!sock_owned_by_user(sk))
435 			do_redirect(icmp_skb, sk);
436 		goto out;
437 	case ICMP_SOURCE_QUENCH:
438 		/* Just silently ignore these. */
439 		goto out;
440 	case ICMP_PARAMETERPROB:
441 		err = EPROTO;
442 		break;
443 	case ICMP_DEST_UNREACH:
444 		if (code > NR_ICMP_UNREACH)
445 			goto out;
446 
447 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
448 			/* We are not interested in TCP_LISTEN and open_requests
449 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
450 			 * they should go through unfragmented).
451 			 */
452 			if (sk->sk_state == TCP_LISTEN)
453 				goto out;
454 
455 			tp->mtu_info = info;
456 			if (!sock_owned_by_user(sk)) {
457 				tcp_v4_mtu_reduced(sk);
458 			} else {
459 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
460 					sock_hold(sk);
461 			}
462 			goto out;
463 		}
464 
465 		err = icmp_err_convert[code].errno;
466 		/* check if icmp_skb allows revert of backoff
467 		 * (see draft-zimmermann-tcp-lcd) */
468 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
469 			break;
470 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
471 		    !icsk->icsk_backoff || fastopen)
472 			break;
473 
474 		if (sock_owned_by_user(sk))
475 			break;
476 
477 		icsk->icsk_backoff--;
478 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
479 					       TCP_TIMEOUT_INIT;
480 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
481 
482 		skb = tcp_write_queue_head(sk);
483 		BUG_ON(!skb);
484 
485 		remaining = icsk->icsk_rto -
486 			    min(icsk->icsk_rto,
487 				tcp_time_stamp - tcp_skb_timestamp(skb));
488 
489 		if (remaining) {
490 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
491 						  remaining, TCP_RTO_MAX);
492 		} else {
493 			/* RTO revert clocked out retransmission.
494 			 * Will retransmit now */
495 			tcp_retransmit_timer(sk);
496 		}
497 
498 		break;
499 	case ICMP_TIME_EXCEEDED:
500 		err = EHOSTUNREACH;
501 		break;
502 	default:
503 		goto out;
504 	}
505 
506 	switch (sk->sk_state) {
507 	case TCP_SYN_SENT:
508 	case TCP_SYN_RECV:
509 		/* Only in fast or simultaneous open. If a fast open socket
510 		 * is already accepted it is treated as a connected one below.
511 		 */
512 		if (fastopen && !fastopen->sk)
513 			break;
514 
515 		if (!sock_owned_by_user(sk)) {
516 			sk->sk_err = err;
517 
518 			sk->sk_error_report(sk);
519 
520 			tcp_done(sk);
521 		} else {
522 			sk->sk_err_soft = err;
523 		}
524 		goto out;
525 	}
526 
527 	/* If we've already connected we will keep trying
528 	 * until we time out, or the user gives up.
529 	 *
530 	 * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
531 	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
532 	 * obsoleted by pmtu discovery).
533 	 *
534 	 * Note that in the modern internet, where routing is unreliable and
535 	 * broken firewalls sit in every dark corner sending random errors
536 	 * ordered by their masters, even these two messages have finally lost
537 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
538 	 *
539 	 * Now we are in compliance with RFCs.
540 	 *							--ANK (980905)
541 	 */
542 
543 	inet = inet_sk(sk);
544 	if (!sock_owned_by_user(sk) && inet->recverr) {
545 		sk->sk_err = err;
546 		sk->sk_error_report(sk);
547 	} else	{ /* Only an error on timeout */
548 		sk->sk_err_soft = err;
549 	}
550 
551 out:
552 	bh_unlock_sock(sk);
553 	sock_put(sk);
554 }
555 
556 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
557 {
558 	struct tcphdr *th = tcp_hdr(skb);
559 
560 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
561 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
562 		skb->csum_start = skb_transport_header(skb) - skb->head;
563 		skb->csum_offset = offsetof(struct tcphdr, check);
564 	} else {
565 		th->check = tcp_v4_check(skb->len, saddr, daddr,
566 					 csum_partial(th,
567 						      th->doff << 2,
568 						      skb->csum));
569 	}
570 }
571 
572 /* This routine computes an IPv4 TCP checksum. */
573 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
574 {
575 	const struct inet_sock *inet = inet_sk(sk);
576 
577 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
578 }
579 EXPORT_SYMBOL(tcp_v4_send_check);
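
/* Checksum note for the two helpers above: for CHECKSUM_PARTIAL skbs only the
 * pseudo-header sum is placed in th->check (the ~ undoes the final complement
 * so the device or checksum helpers can keep summing), and csum_start /
 * csum_offset tell them where to finish the job; otherwise the full checksum
 * over the TCP header and payload is computed in software with csum_partial().
 */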
580 
581 /*
582  *	This routine will send an RST to the other tcp.
583  *
584  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
585  *		      for the reset.
586  *	Answer: if a packet caused the RST, it is not for a socket
587  *		existing in our system; if it is matched to a socket,
588  *		it is just a duplicate segment or a bug in the other
589  *		side's TCP. So we build the reply based only on the
590  *		parameters that arrived with the segment.
591  *	Exception: precedence violation. We do not implement it in any case.
592  */
593 
594 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
595 {
596 	const struct tcphdr *th = tcp_hdr(skb);
597 	struct {
598 		struct tcphdr th;
599 #ifdef CONFIG_TCP_MD5SIG
600 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
601 #endif
602 	} rep;
603 	struct ip_reply_arg arg;
604 #ifdef CONFIG_TCP_MD5SIG
605 	struct tcp_md5sig_key *key = NULL;
606 	const __u8 *hash_location = NULL;
607 	unsigned char newhash[16];
608 	int genhash;
609 	struct sock *sk1 = NULL;
610 #endif
611 	struct net *net;
612 
613 	/* Never send a reset in response to a reset. */
614 	if (th->rst)
615 		return;
616 
617 	/* If sk is not NULL, it means we did a successful lookup and the incoming
618 	 * route had to be correct. The prequeue might have dropped our dst.
619 	 */
620 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
621 		return;
622 
623 	/* Swap the send and the receive. */
624 	memset(&rep, 0, sizeof(rep));
625 	rep.th.dest   = th->source;
626 	rep.th.source = th->dest;
627 	rep.th.doff   = sizeof(struct tcphdr) / 4;
628 	rep.th.rst    = 1;
629 
630 	if (th->ack) {
631 		rep.th.seq = th->ack_seq;
632 	} else {
633 		rep.th.ack = 1;
634 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
635 				       skb->len - (th->doff << 2));
636 	}
637 
638 	memset(&arg, 0, sizeof(arg));
639 	arg.iov[0].iov_base = (unsigned char *)&rep;
640 	arg.iov[0].iov_len  = sizeof(rep.th);
641 
642 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
643 #ifdef CONFIG_TCP_MD5SIG
644 	rcu_read_lock();
645 	hash_location = tcp_parse_md5sig_option(th);
646 	if (sk && sk_fullsock(sk)) {
647 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
648 					&ip_hdr(skb)->saddr, AF_INET);
649 	} else if (hash_location) {
650 		/*
651 		 * The active side is lost. Try to find the listening socket via
652 		 * the source port, and then find the md5 key through that
653 		 * listening socket. We do not lose security here:
654 		 * the incoming packet is checked against the md5 hash of the key
655 		 * we find, and no RST is generated if the hash doesn't match.
656 		 */
657 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
658 					     ip_hdr(skb)->saddr,
659 					     th->source, ip_hdr(skb)->daddr,
660 					     ntohs(th->source), inet_iif(skb));
661 		/* don't send an RST if we can't find a key */
662 		if (!sk1)
663 			goto out;
664 
665 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
666 					&ip_hdr(skb)->saddr, AF_INET);
667 		if (!key)
668 			goto out;
669 
670 
671 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
672 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
673 			goto out;
674 
675 	}
676 
677 	if (key) {
678 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
679 				   (TCPOPT_NOP << 16) |
680 				   (TCPOPT_MD5SIG << 8) |
681 				   TCPOLEN_MD5SIG);
682 		/* Update length and the length the header thinks exists */
683 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
684 		rep.th.doff = arg.iov[0].iov_len / 4;
685 
686 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
687 				     key, ip_hdr(skb)->saddr,
688 				     ip_hdr(skb)->daddr, &rep.th);
689 	}
690 #endif
691 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
692 				      ip_hdr(skb)->saddr, /* XXX */
693 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
694 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
695 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
696 
697 	/* When the socket is gone, all binding information is lost and
698 	 * routing might fail in this case. No choice here: if we force the
699 	 * input interface, we will misroute in case of an asymmetric route.
700 	 */
701 	if (sk)
702 		arg.bound_dev_if = sk->sk_bound_dev_if;
703 
704 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
705 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
706 
707 	arg.tos = ip_hdr(skb)->tos;
708 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
709 	local_bh_disable();
710 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
711 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
712 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
713 			      &arg, arg.iov[0].iov_len);
714 
715 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
716 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
717 	local_bh_enable();
718 
719 #ifdef CONFIG_TCP_MD5SIG
720 out:
721 	rcu_read_unlock();
722 #endif
723 }
724 
725 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
726    outside of socket context, is certainly ugly. What can I do?
727  */
728 
729 static void tcp_v4_send_ack(const struct sock *sk,
730 			    struct sk_buff *skb, u32 seq, u32 ack,
731 			    u32 win, u32 tsval, u32 tsecr, int oif,
732 			    struct tcp_md5sig_key *key,
733 			    int reply_flags, u8 tos)
734 {
735 	const struct tcphdr *th = tcp_hdr(skb);
736 	struct {
737 		struct tcphdr th;
738 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
739 #ifdef CONFIG_TCP_MD5SIG
740 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
741 #endif
742 			];
743 	} rep;
744 	struct net *net = sock_net(sk);
745 	struct ip_reply_arg arg;
746 
747 	memset(&rep.th, 0, sizeof(struct tcphdr));
748 	memset(&arg, 0, sizeof(arg));
749 
750 	arg.iov[0].iov_base = (unsigned char *)&rep;
751 	arg.iov[0].iov_len  = sizeof(rep.th);
752 	if (tsecr) {
753 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
754 				   (TCPOPT_TIMESTAMP << 8) |
755 				   TCPOLEN_TIMESTAMP);
756 		rep.opt[1] = htonl(tsval);
757 		rep.opt[2] = htonl(tsecr);
758 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
759 	}
760 
761 	/* Swap the send and the receive. */
762 	rep.th.dest    = th->source;
763 	rep.th.source  = th->dest;
764 	rep.th.doff    = arg.iov[0].iov_len / 4;
765 	rep.th.seq     = htonl(seq);
766 	rep.th.ack_seq = htonl(ack);
767 	rep.th.ack     = 1;
768 	rep.th.window  = htons(win);
769 
770 #ifdef CONFIG_TCP_MD5SIG
771 	if (key) {
772 		int offset = (tsecr) ? 3 : 0;
773 
774 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
775 					  (TCPOPT_NOP << 16) |
776 					  (TCPOPT_MD5SIG << 8) |
777 					  TCPOLEN_MD5SIG);
778 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
779 		rep.th.doff = arg.iov[0].iov_len/4;
780 
781 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
782 				    key, ip_hdr(skb)->saddr,
783 				    ip_hdr(skb)->daddr, &rep.th);
784 	}
785 #endif
786 	arg.flags = reply_flags;
787 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
788 				      ip_hdr(skb)->saddr, /* XXX */
789 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
790 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
791 	if (oif)
792 		arg.bound_dev_if = oif;
793 	arg.tos = tos;
794 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
795 	local_bh_disable();
796 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
797 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
798 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
799 			      &arg, arg.iov[0].iov_len);
800 
801 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
802 	local_bh_enable();
803 }
804 
805 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
806 {
807 	struct inet_timewait_sock *tw = inet_twsk(sk);
808 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
809 
810 	tcp_v4_send_ack(sk, skb,
811 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
812 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
813 			tcp_time_stamp + tcptw->tw_ts_offset,
814 			tcptw->tw_ts_recent,
815 			tw->tw_bound_dev_if,
816 			tcp_twsk_md5_key(tcptw),
817 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
818 			tw->tw_tos
819 			);
820 
821 	inet_twsk_put(tw);
822 }
823 
824 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
825 				  struct request_sock *req)
826 {
827 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
828 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
829 	 */
830 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
831 					     tcp_sk(sk)->snd_nxt;
832 
833 	/* RFC 7323 2.3
834 	 * The window field (SEG.WND) of every outgoing segment, with the
835 	 * exception of <SYN> segments, MUST be right-shifted by
836 	 * Rcv.Wind.Shift bits:
837 	 */
838 	tcp_v4_send_ack(sk, skb, seq,
839 			tcp_rsk(req)->rcv_nxt,
840 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
841 			tcp_time_stamp + tcp_rsk(req)->ts_off,
842 			req->ts_recent,
843 			0,
844 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
845 					  AF_INET),
846 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
847 			ip_hdr(skb)->tos);
848 }
849 
850 /*
851  *	Send a SYN-ACK after having received a SYN.
852  *	This still operates on a request_sock only, not on a big
853  *	socket.
854  */
855 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
856 			      struct flowi *fl,
857 			      struct request_sock *req,
858 			      struct tcp_fastopen_cookie *foc,
859 			      enum tcp_synack_type synack_type)
860 {
861 	const struct inet_request_sock *ireq = inet_rsk(req);
862 	struct flowi4 fl4;
863 	int err = -1;
864 	struct sk_buff *skb;
865 
866 	/* First, grab a route. */
867 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
868 		return -1;
869 
870 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
871 
872 	if (skb) {
873 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
874 
875 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
876 					    ireq->ir_rmt_addr,
877 					    ireq->opt);
878 		err = net_xmit_eval(err);
879 	}
880 
881 	return err;
882 }
883 
884 /*
885  *	IPv4 request_sock destructor.
886  */
887 static void tcp_v4_reqsk_destructor(struct request_sock *req)
888 {
889 	kfree(inet_rsk(req)->opt);
890 }
891 
892 #ifdef CONFIG_TCP_MD5SIG
893 /*
894  * RFC2385 MD5 checksumming requires a mapping of
895  * IP address->MD5 Key.
896  * We need to maintain these in the sk structure.
897  */
898 
899 /* Find the Key structure for an address.  */
900 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
901 					 const union tcp_md5_addr *addr,
902 					 int family)
903 {
904 	const struct tcp_sock *tp = tcp_sk(sk);
905 	struct tcp_md5sig_key *key;
906 	unsigned int size = sizeof(struct in_addr);
907 	const struct tcp_md5sig_info *md5sig;
908 
909 	/* caller either holds rcu_read_lock() or socket lock */
910 	md5sig = rcu_dereference_check(tp->md5sig_info,
911 				       lockdep_sock_is_held(sk));
912 	if (!md5sig)
913 		return NULL;
914 #if IS_ENABLED(CONFIG_IPV6)
915 	if (family == AF_INET6)
916 		size = sizeof(struct in6_addr);
917 #endif
918 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
919 		if (key->family != family)
920 			continue;
921 		if (!memcmp(&key->addr, addr, size))
922 			return key;
923 	}
924 	return NULL;
925 }
926 EXPORT_SYMBOL(tcp_md5_do_lookup);
927 
928 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
929 					 const struct sock *addr_sk)
930 {
931 	const union tcp_md5_addr *addr;
932 
933 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
934 	return tcp_md5_do_lookup(sk, addr, AF_INET);
935 }
936 EXPORT_SYMBOL(tcp_v4_md5_lookup);
937 
938 /* This can be called on a newly created socket, from other files */
939 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
940 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
941 {
942 	/* Add Key to the list */
943 	struct tcp_md5sig_key *key;
944 	struct tcp_sock *tp = tcp_sk(sk);
945 	struct tcp_md5sig_info *md5sig;
946 
947 	key = tcp_md5_do_lookup(sk, addr, family);
948 	if (key) {
949 		/* Pre-existing entry - just update that one. */
950 		memcpy(key->key, newkey, newkeylen);
951 		key->keylen = newkeylen;
952 		return 0;
953 	}
954 
955 	md5sig = rcu_dereference_protected(tp->md5sig_info,
956 					   lockdep_sock_is_held(sk));
957 	if (!md5sig) {
958 		md5sig = kmalloc(sizeof(*md5sig), gfp);
959 		if (!md5sig)
960 			return -ENOMEM;
961 
962 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
963 		INIT_HLIST_HEAD(&md5sig->head);
964 		rcu_assign_pointer(tp->md5sig_info, md5sig);
965 	}
966 
967 	key = sock_kmalloc(sk, sizeof(*key), gfp);
968 	if (!key)
969 		return -ENOMEM;
970 	if (!tcp_alloc_md5sig_pool()) {
971 		sock_kfree_s(sk, key, sizeof(*key));
972 		return -ENOMEM;
973 	}
974 
975 	memcpy(key->key, newkey, newkeylen);
976 	key->keylen = newkeylen;
977 	key->family = family;
978 	memcpy(&key->addr, addr,
979 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
980 				      sizeof(struct in_addr));
981 	hlist_add_head_rcu(&key->node, &md5sig->head);
982 	return 0;
983 }
984 EXPORT_SYMBOL(tcp_md5_do_add);
985 
986 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
987 {
988 	struct tcp_md5sig_key *key;
989 
990 	key = tcp_md5_do_lookup(sk, addr, family);
991 	if (!key)
992 		return -ENOENT;
993 	hlist_del_rcu(&key->node);
994 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
995 	kfree_rcu(key, rcu);
996 	return 0;
997 }
998 EXPORT_SYMBOL(tcp_md5_do_del);
999 
1000 static void tcp_clear_md5_list(struct sock *sk)
1001 {
1002 	struct tcp_sock *tp = tcp_sk(sk);
1003 	struct tcp_md5sig_key *key;
1004 	struct hlist_node *n;
1005 	struct tcp_md5sig_info *md5sig;
1006 
1007 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1008 
1009 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1010 		hlist_del_rcu(&key->node);
1011 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1012 		kfree_rcu(key, rcu);
1013 	}
1014 }
1015 
1016 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1017 				 int optlen)
1018 {
1019 	struct tcp_md5sig cmd;
1020 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1021 
1022 	if (optlen < sizeof(cmd))
1023 		return -EINVAL;
1024 
1025 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1026 		return -EFAULT;
1027 
1028 	if (sin->sin_family != AF_INET)
1029 		return -EINVAL;
1030 
1031 	if (!cmd.tcpm_keylen)
1032 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1033 				      AF_INET);
1034 
1035 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1036 		return -EINVAL;
1037 
1038 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1039 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1040 			      GFP_KERNEL);
1041 }
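
/* The structure copied from userspace above is the TCP_MD5SIG socket option
 * payload from <linux/tcp.h>. A minimal userspace sketch (illustrative peer
 * address and key) that lands in this parser:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that peer, as handled above.
 */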
1042 
1043 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1044 				   __be32 daddr, __be32 saddr,
1045 				   const struct tcphdr *th, int nbytes)
1046 {
1047 	struct tcp4_pseudohdr *bp;
1048 	struct scatterlist sg;
1049 	struct tcphdr *_th;
1050 
1051 	bp = hp->scratch;
1052 	bp->saddr = saddr;
1053 	bp->daddr = daddr;
1054 	bp->pad = 0;
1055 	bp->protocol = IPPROTO_TCP;
1056 	bp->len = cpu_to_be16(nbytes);
1057 
1058 	_th = (struct tcphdr *)(bp + 1);
1059 	memcpy(_th, th, sizeof(*th));
1060 	_th->check = 0;
1061 
1062 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1063 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1064 				sizeof(*bp) + sizeof(*th));
1065 	return crypto_ahash_update(hp->md5_req);
1066 }
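
/* For clarity, the bytes fed into the MD5 transform above follow the RFC 2385
 * order: the IPv4 pseudo-header (saddr, daddr, zero pad, protocol, segment
 * length), then the fixed 20-byte TCP header (options excluded) with its
 * checksum field zeroed; callers append the payload (for full segments) and
 * finally the key itself.
 */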
1067 
1068 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1069 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1070 {
1071 	struct tcp_md5sig_pool *hp;
1072 	struct ahash_request *req;
1073 
1074 	hp = tcp_get_md5sig_pool();
1075 	if (!hp)
1076 		goto clear_hash_noput;
1077 	req = hp->md5_req;
1078 
1079 	if (crypto_ahash_init(req))
1080 		goto clear_hash;
1081 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1082 		goto clear_hash;
1083 	if (tcp_md5_hash_key(hp, key))
1084 		goto clear_hash;
1085 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1086 	if (crypto_ahash_final(req))
1087 		goto clear_hash;
1088 
1089 	tcp_put_md5sig_pool();
1090 	return 0;
1091 
1092 clear_hash:
1093 	tcp_put_md5sig_pool();
1094 clear_hash_noput:
1095 	memset(md5_hash, 0, 16);
1096 	return 1;
1097 }
1098 
1099 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1100 			const struct sock *sk,
1101 			const struct sk_buff *skb)
1102 {
1103 	struct tcp_md5sig_pool *hp;
1104 	struct ahash_request *req;
1105 	const struct tcphdr *th = tcp_hdr(skb);
1106 	__be32 saddr, daddr;
1107 
1108 	if (sk) { /* valid for establish/request sockets */
1109 		saddr = sk->sk_rcv_saddr;
1110 		daddr = sk->sk_daddr;
1111 	} else {
1112 		const struct iphdr *iph = ip_hdr(skb);
1113 		saddr = iph->saddr;
1114 		daddr = iph->daddr;
1115 	}
1116 
1117 	hp = tcp_get_md5sig_pool();
1118 	if (!hp)
1119 		goto clear_hash_noput;
1120 	req = hp->md5_req;
1121 
1122 	if (crypto_ahash_init(req))
1123 		goto clear_hash;
1124 
1125 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1126 		goto clear_hash;
1127 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1128 		goto clear_hash;
1129 	if (tcp_md5_hash_key(hp, key))
1130 		goto clear_hash;
1131 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1132 	if (crypto_ahash_final(req))
1133 		goto clear_hash;
1134 
1135 	tcp_put_md5sig_pool();
1136 	return 0;
1137 
1138 clear_hash:
1139 	tcp_put_md5sig_pool();
1140 clear_hash_noput:
1141 	memset(md5_hash, 0, 16);
1142 	return 1;
1143 }
1144 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1145 
1146 #endif
1147 
1148 /* Called with rcu_read_lock() */
1149 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1150 				    const struct sk_buff *skb)
1151 {
1152 #ifdef CONFIG_TCP_MD5SIG
1153 	/*
1154 	 * This gets called for each TCP segment that arrives
1155 	 * so we want to be efficient.
1156 	 * We have 3 drop cases:
1157 	 * o No MD5 hash and one expected.
1158 	 * o MD5 hash and we're not expecting one.
1159 	 * o MD5 hash and it's wrong.
1160 	 */
1161 	const __u8 *hash_location = NULL;
1162 	struct tcp_md5sig_key *hash_expected;
1163 	const struct iphdr *iph = ip_hdr(skb);
1164 	const struct tcphdr *th = tcp_hdr(skb);
1165 	int genhash;
1166 	unsigned char newhash[16];
1167 
1168 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1169 					  AF_INET);
1170 	hash_location = tcp_parse_md5sig_option(th);
1171 
1172 	/* We've parsed the options - do we have a hash? */
1173 	if (!hash_expected && !hash_location)
1174 		return false;
1175 
1176 	if (hash_expected && !hash_location) {
1177 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1178 		return true;
1179 	}
1180 
1181 	if (!hash_expected && hash_location) {
1182 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1183 		return true;
1184 	}
1185 
1186 	/* Okay, we have both hash_expected and hash_location -
1187 	 * so we need to calculate the hash and compare.
1188 	 */
1189 	genhash = tcp_v4_md5_hash_skb(newhash,
1190 				      hash_expected,
1191 				      NULL, skb);
1192 
1193 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1194 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1195 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1196 				     &iph->saddr, ntohs(th->source),
1197 				     &iph->daddr, ntohs(th->dest),
1198 				     genhash ? " tcp_v4_calc_md5_hash failed"
1199 				     : "");
1200 		return true;
1201 	}
1202 	return false;
1203 #endif
1204 	return false;
1205 }
1206 
1207 static void tcp_v4_init_req(struct request_sock *req,
1208 			    const struct sock *sk_listener,
1209 			    struct sk_buff *skb)
1210 {
1211 	struct inet_request_sock *ireq = inet_rsk(req);
1212 
1213 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1214 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1215 	ireq->opt = tcp_v4_save_options(skb);
1216 }
1217 
1218 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1219 					  struct flowi *fl,
1220 					  const struct request_sock *req,
1221 					  bool *strict)
1222 {
1223 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1224 
1225 	if (strict) {
1226 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1227 			*strict = true;
1228 		else
1229 			*strict = false;
1230 	}
1231 
1232 	return dst;
1233 }
1234 
1235 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1236 	.family		=	PF_INET,
1237 	.obj_size	=	sizeof(struct tcp_request_sock),
1238 	.rtx_syn_ack	=	tcp_rtx_synack,
1239 	.send_ack	=	tcp_v4_reqsk_send_ack,
1240 	.destructor	=	tcp_v4_reqsk_destructor,
1241 	.send_reset	=	tcp_v4_send_reset,
1242 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1243 };
1244 
1245 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1246 	.mss_clamp	=	TCP_MSS_DEFAULT,
1247 #ifdef CONFIG_TCP_MD5SIG
1248 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1249 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1250 #endif
1251 	.init_req	=	tcp_v4_init_req,
1252 #ifdef CONFIG_SYN_COOKIES
1253 	.cookie_init_seq =	cookie_v4_init_sequence,
1254 #endif
1255 	.route_req	=	tcp_v4_route_req,
1256 	.init_seq	=	tcp_v4_init_sequence,
1257 	.send_synack	=	tcp_v4_send_synack,
1258 };
1259 
1260 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1261 {
1262 	/* Never answer SYNs sent to broadcast or multicast addresses */
1263 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1264 		goto drop;
1265 
1266 	return tcp_conn_request(&tcp_request_sock_ops,
1267 				&tcp_request_sock_ipv4_ops, sk, skb);
1268 
1269 drop:
1270 	tcp_listendrop(sk);
1271 	return 0;
1272 }
1273 EXPORT_SYMBOL(tcp_v4_conn_request);
1274 
1275 
1276 /*
1277  * The three way handshake has completed - we got a valid synack -
1278  * now create the new socket.
1279  */
1280 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1281 				  struct request_sock *req,
1282 				  struct dst_entry *dst,
1283 				  struct request_sock *req_unhash,
1284 				  bool *own_req)
1285 {
1286 	struct inet_request_sock *ireq;
1287 	struct inet_sock *newinet;
1288 	struct tcp_sock *newtp;
1289 	struct sock *newsk;
1290 #ifdef CONFIG_TCP_MD5SIG
1291 	struct tcp_md5sig_key *key;
1292 #endif
1293 	struct ip_options_rcu *inet_opt;
1294 
1295 	if (sk_acceptq_is_full(sk))
1296 		goto exit_overflow;
1297 
1298 	newsk = tcp_create_openreq_child(sk, req, skb);
1299 	if (!newsk)
1300 		goto exit_nonewsk;
1301 
1302 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1303 	inet_sk_rx_dst_set(newsk, skb);
1304 
1305 	newtp		      = tcp_sk(newsk);
1306 	newinet		      = inet_sk(newsk);
1307 	ireq		      = inet_rsk(req);
1308 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1309 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1310 	newsk->sk_bound_dev_if = ireq->ir_iif;
1311 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1312 	inet_opt	      = ireq->opt;
1313 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1314 	ireq->opt	      = NULL;
1315 	newinet->mc_index     = inet_iif(skb);
1316 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1317 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1318 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1319 	if (inet_opt)
1320 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1321 	newinet->inet_id = newtp->write_seq ^ jiffies;
1322 
1323 	if (!dst) {
1324 		dst = inet_csk_route_child_sock(sk, newsk, req);
1325 		if (!dst)
1326 			goto put_and_exit;
1327 	} else {
1328 		/* syncookie case : see end of cookie_v4_check() */
1329 	}
1330 	sk_setup_caps(newsk, dst);
1331 
1332 	tcp_ca_openreq_child(newsk, dst);
1333 
1334 	tcp_sync_mss(newsk, dst_mtu(dst));
1335 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1336 
1337 	tcp_initialize_rcv_mss(newsk);
1338 
1339 #ifdef CONFIG_TCP_MD5SIG
1340 	/* Copy over the MD5 key from the original socket */
1341 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1342 				AF_INET);
1343 	if (key) {
1344 		/*
1345 		 * We're using one, so create a matching key
1346 		 * on the newsk structure. If we fail to get
1347 		 * memory, then we end up not copying the key
1348 		 * across. Shucks.
1349 		 */
1350 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1351 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1352 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1353 	}
1354 #endif
1355 
1356 	if (__inet_inherit_port(sk, newsk) < 0)
1357 		goto put_and_exit;
1358 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1359 	if (*own_req)
1360 		tcp_move_syn(newtp, req);
1361 
1362 	return newsk;
1363 
1364 exit_overflow:
1365 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1366 exit_nonewsk:
1367 	dst_release(dst);
1368 exit:
1369 	tcp_listendrop(sk);
1370 	return NULL;
1371 put_and_exit:
1372 	inet_csk_prepare_forced_close(newsk);
1373 	tcp_done(newsk);
1374 	goto exit;
1375 }
1376 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1377 
1378 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1379 {
1380 #ifdef CONFIG_SYN_COOKIES
1381 	const struct tcphdr *th = tcp_hdr(skb);
1382 
1383 	if (!th->syn)
1384 		sk = cookie_v4_check(sk, skb);
1385 #endif
1386 	return sk;
1387 }
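
/* Syncookie note: by the time this runs for a non-SYN segment on a listener,
 * any state the original SYN would normally have created may exist only
 * encoded in the sequence/timestamp fields of our earlier SYN-ACK.
 * cookie_v4_check() recomputes that encoding from the incoming ACK and, if it
 * validates, rebuilds the request sock and child socket on the spot.
 */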
1388 
1389 /* The socket must have its spinlock held when we get
1390  * here, unless it is a TCP_LISTEN socket.
1391  *
1392  * We have a potential double-lock case here, so even when
1393  * doing backlog processing we use the BH locking scheme.
1394  * This is because we cannot sleep with the original spinlock
1395  * held.
1396  */
1397 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1398 {
1399 	struct sock *rsk;
1400 
1401 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1402 		struct dst_entry *dst = sk->sk_rx_dst;
1403 
1404 		sock_rps_save_rxhash(sk, skb);
1405 		sk_mark_napi_id(sk, skb);
1406 		if (dst) {
1407 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1408 			    !dst->ops->check(dst, 0)) {
1409 				dst_release(dst);
1410 				sk->sk_rx_dst = NULL;
1411 			}
1412 		}
1413 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1414 		return 0;
1415 	}
1416 
1417 	if (tcp_checksum_complete(skb))
1418 		goto csum_err;
1419 
1420 	if (sk->sk_state == TCP_LISTEN) {
1421 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1422 
1423 		if (!nsk)
1424 			goto discard;
1425 		if (nsk != sk) {
1426 			sock_rps_save_rxhash(nsk, skb);
1427 			sk_mark_napi_id(nsk, skb);
1428 			if (tcp_child_process(sk, nsk, skb)) {
1429 				rsk = nsk;
1430 				goto reset;
1431 			}
1432 			return 0;
1433 		}
1434 	} else
1435 		sock_rps_save_rxhash(sk, skb);
1436 
1437 	if (tcp_rcv_state_process(sk, skb)) {
1438 		rsk = sk;
1439 		goto reset;
1440 	}
1441 	return 0;
1442 
1443 reset:
1444 	tcp_v4_send_reset(rsk, skb);
1445 discard:
1446 	kfree_skb(skb);
1447 	/* Be careful here. If this function gets more complicated and
1448 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1449 	 * might be destroyed here. This current version compiles correctly,
1450 	 * but you have been warned.
1451 	 */
1452 	return 0;
1453 
1454 csum_err:
1455 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1456 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1457 	goto discard;
1458 }
1459 EXPORT_SYMBOL(tcp_v4_do_rcv);
1460 
1461 void tcp_v4_early_demux(struct sk_buff *skb)
1462 {
1463 	const struct iphdr *iph;
1464 	const struct tcphdr *th;
1465 	struct sock *sk;
1466 
1467 	if (skb->pkt_type != PACKET_HOST)
1468 		return;
1469 
1470 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1471 		return;
1472 
1473 	iph = ip_hdr(skb);
1474 	th = tcp_hdr(skb);
1475 
1476 	if (th->doff < sizeof(struct tcphdr) / 4)
1477 		return;
1478 
1479 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1480 				       iph->saddr, th->source,
1481 				       iph->daddr, ntohs(th->dest),
1482 				       skb->skb_iif);
1483 	if (sk) {
1484 		skb->sk = sk;
1485 		skb->destructor = sock_edemux;
1486 		if (sk_fullsock(sk)) {
1487 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1488 
1489 			if (dst)
1490 				dst = dst_check(dst, 0);
1491 			if (dst &&
1492 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1493 				skb_dst_set_noref(skb, dst);
1494 		}
1495 	}
1496 }
1497 
1498 /* Packet is added to VJ-style prequeue for processing in process
1499  * context, if a reader task is waiting. Apparently, this exciting
1500  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1501  * failed somewhere. Latency? Burstiness? Well, at least now we will
1502  * see why it failed. 8)8)				  --ANK
1503  *
1504  */
1505 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1506 {
1507 	struct tcp_sock *tp = tcp_sk(sk);
1508 
1509 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1510 		return false;
1511 
1512 	if (skb->len <= tcp_hdrlen(skb) &&
1513 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1514 		return false;
1515 
1516 	/* Before escaping RCU protected region, we need to take care of skb
1517 	 * dst. Prequeue is only enabled for established sockets.
1518 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1519 	 * Instead of doing a full sk_rx_dst validity check here, let's perform
1520 	 * an optimistic check.
1521 	 */
1522 	if (likely(sk->sk_rx_dst))
1523 		skb_dst_drop(skb);
1524 	else
1525 		skb_dst_force_safe(skb);
1526 
1527 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1528 	tp->ucopy.memory += skb->truesize;
1529 	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1530 	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1531 		struct sk_buff *skb1;
1532 
1533 		BUG_ON(sock_owned_by_user(sk));
1534 		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1535 				skb_queue_len(&tp->ucopy.prequeue));
1536 
1537 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1538 			sk_backlog_rcv(sk, skb1);
1539 
1540 		tp->ucopy.memory = 0;
1541 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1542 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1543 					   POLLIN | POLLRDNORM | POLLRDBAND);
1544 		if (!inet_csk_ack_scheduled(sk))
1545 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1546 						  (3 * tcp_rto_min(sk)) / 4,
1547 						  TCP_RTO_MAX);
1548 	}
1549 	return true;
1550 }
1551 EXPORT_SYMBOL(tcp_prequeue);
1552 
1553 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1554 {
1555 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1556 
1557 	/* Only the socket owner can try to collapse/prune the rx queues
1558 	 * to reduce memory overhead, so add a little headroom here.
1559 	 * Only a few socket backlogs are likely to be non-empty at once.
1560 	 */
1561 	limit += 64*1024;
1562 
1563 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1564 	 * we can fix skb->truesize to its real value to avoid future drops.
1565 	 * This is valid because skb is not yet charged to the socket.
1566 	 * It has been noticed that pure SACK packets were sometimes dropped
1567 	 * (if cooked by drivers without copybreak feature).
1568 	 */
1569 	skb_condense(skb);
1570 
1571 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1572 		bh_unlock_sock(sk);
1573 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1574 		return true;
1575 	}
1576 	return false;
1577 }
1578 EXPORT_SYMBOL(tcp_add_backlog);
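
/* Rough sizing example (hypothetical buffer sizes) for the limit above: with
 * sk_rcvbuf = sk_sndbuf = 256 KB, the backlog may hold roughly
 * 256 KB + 256 KB + 64 KB = 576 KB of queued data (accounted by skb->truesize)
 * before further segments are dropped and LINUX_MIB_TCPBACKLOGDROP is bumped.
 */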
1579 
1580 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1581 {
1582 	struct tcphdr *th = (struct tcphdr *)skb->data;
1583 	unsigned int eaten = skb->len;
1584 	int err;
1585 
1586 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1587 	if (!err) {
1588 		eaten -= skb->len;
1589 		TCP_SKB_CB(skb)->end_seq -= eaten;
1590 	}
1591 	return err;
1592 }
1593 EXPORT_SYMBOL(tcp_filter);
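
/* Note on the arithmetic above: sk_filter_trim_cap() runs the socket's BPF
 * filter and may shrink the skb, but never below th->doff * 4, so the TCP
 * header always survives. "eaten" is the number of payload bytes the filter
 * trimmed, and end_seq is pulled back by the same amount so later sequence
 * accounting stays consistent.
 */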
1594 
1595 /*
1596  *	From tcp_input.c
1597  */
1598 
1599 int tcp_v4_rcv(struct sk_buff *skb)
1600 {
1601 	struct net *net = dev_net(skb->dev);
1602 	const struct iphdr *iph;
1603 	const struct tcphdr *th;
1604 	bool refcounted;
1605 	struct sock *sk;
1606 	int ret;
1607 
1608 	if (skb->pkt_type != PACKET_HOST)
1609 		goto discard_it;
1610 
1611 	/* Count it even if it's bad */
1612 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1613 
1614 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1615 		goto discard_it;
1616 
1617 	th = (const struct tcphdr *)skb->data;
1618 
1619 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1620 		goto bad_packet;
1621 	if (!pskb_may_pull(skb, th->doff * 4))
1622 		goto discard_it;
1623 
1624 	/* An explanation is required here, I think.
1625 	 * Packet length and doff are validated by header prediction,
1626 	 * provided the case of th->doff == 0 is eliminated.
1627 	 * So, we defer the checks. */
1628 
1629 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1630 		goto csum_error;
1631 
1632 	th = (const struct tcphdr *)skb->data;
1633 	iph = ip_hdr(skb);
1634 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1635 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1636 	 */
1637 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1638 		sizeof(struct inet_skb_parm));
1639 	barrier();
1640 
1641 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1642 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1643 				    skb->len - th->doff * 4);
1644 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1645 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1646 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1647 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1648 	TCP_SKB_CB(skb)->sacked	 = 0;
1649 
1650 lookup:
1651 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1652 			       th->dest, &refcounted);
1653 	if (!sk)
1654 		goto no_tcp_socket;
1655 
1656 process:
1657 	if (sk->sk_state == TCP_TIME_WAIT)
1658 		goto do_time_wait;
1659 
1660 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1661 		struct request_sock *req = inet_reqsk(sk);
1662 		struct sock *nsk;
1663 
1664 		sk = req->rsk_listener;
1665 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1666 			sk_drops_add(sk, skb);
1667 			reqsk_put(req);
1668 			goto discard_it;
1669 		}
1670 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1671 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1672 			goto lookup;
1673 		}
1674 		/* We own a reference on the listener, increase it again
1675 		 * as we might lose it too soon.
1676 		 */
1677 		sock_hold(sk);
1678 		refcounted = true;
1679 		nsk = tcp_check_req(sk, skb, req, false);
1680 		if (!nsk) {
1681 			reqsk_put(req);
1682 			goto discard_and_relse;
1683 		}
1684 		if (nsk == sk) {
1685 			reqsk_put(req);
1686 		} else if (tcp_child_process(sk, nsk, skb)) {
1687 			tcp_v4_send_reset(nsk, skb);
1688 			goto discard_and_relse;
1689 		} else {
1690 			sock_put(sk);
1691 			return 0;
1692 		}
1693 	}
1694 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1695 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1696 		goto discard_and_relse;
1697 	}
1698 
1699 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1700 		goto discard_and_relse;
1701 
1702 	if (tcp_v4_inbound_md5_hash(sk, skb))
1703 		goto discard_and_relse;
1704 
1705 	nf_reset(skb);
1706 
1707 	if (tcp_filter(sk, skb))
1708 		goto discard_and_relse;
1709 	th = (const struct tcphdr *)skb->data;
1710 	iph = ip_hdr(skb);
1711 
1712 	skb->dev = NULL;
1713 
1714 	if (sk->sk_state == TCP_LISTEN) {
1715 		ret = tcp_v4_do_rcv(sk, skb);
1716 		goto put_and_return;
1717 	}
1718 
1719 	sk_incoming_cpu_update(sk);
1720 
1721 	bh_lock_sock_nested(sk);
1722 	tcp_segs_in(tcp_sk(sk), skb);
1723 	ret = 0;
1724 	if (!sock_owned_by_user(sk)) {
1725 		if (!tcp_prequeue(sk, skb))
1726 			ret = tcp_v4_do_rcv(sk, skb);
1727 	} else if (tcp_add_backlog(sk, skb)) {
1728 		goto discard_and_relse;
1729 	}
1730 	bh_unlock_sock(sk);
1731 
1732 put_and_return:
1733 	if (refcounted)
1734 		sock_put(sk);
1735 
1736 	return ret;
1737 
1738 no_tcp_socket:
1739 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1740 		goto discard_it;
1741 
1742 	if (tcp_checksum_complete(skb)) {
1743 csum_error:
1744 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1745 bad_packet:
1746 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1747 	} else {
1748 		tcp_v4_send_reset(NULL, skb);
1749 	}
1750 
1751 discard_it:
1752 	/* Discard frame. */
1753 	kfree_skb(skb);
1754 	return 0;
1755 
1756 discard_and_relse:
1757 	sk_drops_add(sk, skb);
1758 	if (refcounted)
1759 		sock_put(sk);
1760 	goto discard_it;
1761 
1762 do_time_wait:
1763 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1764 		inet_twsk_put(inet_twsk(sk));
1765 		goto discard_it;
1766 	}
1767 
1768 	if (tcp_checksum_complete(skb)) {
1769 		inet_twsk_put(inet_twsk(sk));
1770 		goto csum_error;
1771 	}
1772 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1773 	case TCP_TW_SYN: {
1774 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1775 							&tcp_hashinfo, skb,
1776 							__tcp_hdrlen(th),
1777 							iph->saddr, th->source,
1778 							iph->daddr, th->dest,
1779 							inet_iif(skb));
1780 		if (sk2) {
1781 			inet_twsk_deschedule_put(inet_twsk(sk));
1782 			sk = sk2;
1783 			refcounted = false;
1784 			goto process;
1785 		}
1786 		/* Fall through to ACK */
1787 	}
1788 	case TCP_TW_ACK:
1789 		tcp_v4_timewait_ack(sk, skb);
1790 		break;
1791 	case TCP_TW_RST:
1792 		tcp_v4_send_reset(sk, skb);
1793 		inet_twsk_deschedule_put(inet_twsk(sk));
1794 		goto discard_it;
1795 	case TCP_TW_SUCCESS:;
1796 	}
1797 	goto discard_it;
1798 }
1799 
1800 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1801 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1802 	.twsk_unique	= tcp_twsk_unique,
1803 	.twsk_destructor= tcp_twsk_destructor,
1804 };
1805 
1806 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1807 {
1808 	struct dst_entry *dst = skb_dst(skb);
1809 
1810 	if (dst && dst_hold_safe(dst)) {
1811 		sk->sk_rx_dst = dst;
1812 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1813 	}
1814 }
1815 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1816 
1817 const struct inet_connection_sock_af_ops ipv4_specific = {
1818 	.queue_xmit	   = ip_queue_xmit,
1819 	.send_check	   = tcp_v4_send_check,
1820 	.rebuild_header	   = inet_sk_rebuild_header,
1821 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1822 	.conn_request	   = tcp_v4_conn_request,
1823 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1824 	.net_header_len	   = sizeof(struct iphdr),
1825 	.setsockopt	   = ip_setsockopt,
1826 	.getsockopt	   = ip_getsockopt,
1827 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1828 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1829 #ifdef CONFIG_COMPAT
1830 	.compat_setsockopt = compat_ip_setsockopt,
1831 	.compat_getsockopt = compat_ip_getsockopt,
1832 #endif
1833 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1834 };
1835 EXPORT_SYMBOL(ipv4_specific);
1836 
1837 #ifdef CONFIG_TCP_MD5SIG
1838 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1839 	.md5_lookup		= tcp_v4_md5_lookup,
1840 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1841 	.md5_parse		= tcp_v4_parse_md5_keys,
1842 };
1843 #endif
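/*
 * For illustration only: the md5_parse hook above (tcp_v4_parse_md5_keys)
 * is what services the TCP_MD5SIG socket option from tcp_setsockopt().
 * A minimal user-space sketch, assuming a connected IPv4 socket 'fd', the
 * peer's address in a struct sockaddr_in 'peer' and a NUL-terminated key
 * string 'secret' (all three names are hypothetical; struct tcp_md5sig is
 * declared in <linux/tcp.h>):
 *
 *	struct tcp_md5sig md5 = { };
 *
 *	memcpy(&md5.tcpm_addr, &peer, sizeof(peer));
 *	md5.tcpm_keylen = strlen(secret);
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)) < 0)
 *		perror("TCP_MD5SIG");
 */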
1844 
1845 /* NOTE: Many fields are already zeroed explicitly by the call to
1846  *       sk_alloc(), so they need not be initialized here.
1847  */
1848 static int tcp_v4_init_sock(struct sock *sk)
1849 {
1850 	struct inet_connection_sock *icsk = inet_csk(sk);
1851 
1852 	tcp_init_sock(sk);
1853 
1854 	icsk->icsk_af_ops = &ipv4_specific;
1855 
1856 #ifdef CONFIG_TCP_MD5SIG
1857 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1858 #endif
1859 
1860 	return 0;
1861 }
1862 
1863 void tcp_v4_destroy_sock(struct sock *sk)
1864 {
1865 	struct tcp_sock *tp = tcp_sk(sk);
1866 
1867 	tcp_clear_xmit_timers(sk);
1868 
1869 	tcp_cleanup_congestion_control(sk);
1870 
1871 	/* Clean up the write buffer. */
1872 	tcp_write_queue_purge(sk);
1873 
1874 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1875 	skb_rbtree_purge(&tp->out_of_order_queue);
1876 
1877 #ifdef CONFIG_TCP_MD5SIG
1878 	/* Clean up the MD5 key list, if any */
1879 	if (tp->md5sig_info) {
1880 		tcp_clear_md5_list(sk);
1881 		kfree_rcu(tp->md5sig_info, rcu);
1882 		tp->md5sig_info = NULL;
1883 	}
1884 #endif
1885 
1886 	/* Clean up the prequeue; it really should be empty by now */
1887 	__skb_queue_purge(&tp->ucopy.prequeue);
1888 
1889 	/* Clean up a referenced TCP bind bucket. */
1890 	if (inet_csk(sk)->icsk_bind_hash)
1891 		inet_put_port(sk);
1892 
1893 	BUG_ON(tp->fastopen_rsk);
1894 
1895 	/* In case the socket was aborted during the connect operation */
1896 	tcp_free_fastopen_req(tp);
1897 	tcp_saved_syn_free(tp);
1898 
1899 	sk_sockets_allocated_dec(sk);
1900 }
1901 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1902 
1903 #ifdef CONFIG_PROC_FS
1904 /* Proc filesystem TCP sock list dumping. */
1905 
1906 /*
1907  * Get the next listener socket following cur.  If cur is NULL, get the first
1908  * socket starting from the bucket given in st->bucket; when st->bucket is
1909  * zero the very first socket in the hash table is returned.
1910  */
1911 static void *listening_get_next(struct seq_file *seq, void *cur)
1912 {
1913 	struct tcp_iter_state *st = seq->private;
1914 	struct net *net = seq_file_net(seq);
1915 	struct inet_listen_hashbucket *ilb;
1916 	struct sock *sk = cur;
1917 
1918 	if (!sk) {
1919 get_head:
1920 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1921 		spin_lock(&ilb->lock);
1922 		sk = sk_head(&ilb->head);
1923 		st->offset = 0;
1924 		goto get_sk;
1925 	}
1926 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1927 	++st->num;
1928 	++st->offset;
1929 
1930 	sk = sk_next(sk);
1931 get_sk:
1932 	sk_for_each_from(sk) {
1933 		if (!net_eq(sock_net(sk), net))
1934 			continue;
1935 		if (sk->sk_family == st->family)
1936 			return sk;
1937 	}
1938 	spin_unlock(&ilb->lock);
1939 	st->offset = 0;
1940 	if (++st->bucket < INET_LHTABLE_SIZE)
1941 		goto get_head;
1942 	return NULL;
1943 }
1944 
1945 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1946 {
1947 	struct tcp_iter_state *st = seq->private;
1948 	void *rc;
1949 
1950 	st->bucket = 0;
1951 	st->offset = 0;
1952 	rc = listening_get_next(seq, NULL);
1953 
1954 	while (rc && *pos) {
1955 		rc = listening_get_next(seq, rc);
1956 		--*pos;
1957 	}
1958 	return rc;
1959 }
1960 
1961 static inline bool empty_bucket(const struct tcp_iter_state *st)
1962 {
1963 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1964 }
1965 
1966 /*
1967  * Get the first established socket, starting from the bucket given in st->bucket.
1968  * If st->bucket is zero, the very first socket in the hash is returned.
1969  */
1970 static void *established_get_first(struct seq_file *seq)
1971 {
1972 	struct tcp_iter_state *st = seq->private;
1973 	struct net *net = seq_file_net(seq);
1974 	void *rc = NULL;
1975 
1976 	st->offset = 0;
1977 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1978 		struct sock *sk;
1979 		struct hlist_nulls_node *node;
1980 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1981 
1982 		/* Lockless fast path for the common case of empty buckets */
1983 		if (empty_bucket(st))
1984 			continue;
1985 
1986 		spin_lock_bh(lock);
1987 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1988 			if (sk->sk_family != st->family ||
1989 			    !net_eq(sock_net(sk), net)) {
1990 				continue;
1991 			}
1992 			rc = sk;
1993 			goto out;
1994 		}
1995 		spin_unlock_bh(lock);
1996 	}
1997 out:
1998 	return rc;
1999 }
2000 
2001 static void *established_get_next(struct seq_file *seq, void *cur)
2002 {
2003 	struct sock *sk = cur;
2004 	struct hlist_nulls_node *node;
2005 	struct tcp_iter_state *st = seq->private;
2006 	struct net *net = seq_file_net(seq);
2007 
2008 	++st->num;
2009 	++st->offset;
2010 
2011 	sk = sk_nulls_next(sk);
2012 
2013 	sk_nulls_for_each_from(sk, node) {
2014 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2015 			return sk;
2016 	}
2017 
2018 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2019 	++st->bucket;
2020 	return established_get_first(seq);
2021 }
2022 
2023 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2024 {
2025 	struct tcp_iter_state *st = seq->private;
2026 	void *rc;
2027 
2028 	st->bucket = 0;
2029 	rc = established_get_first(seq);
2030 
2031 	while (rc && pos) {
2032 		rc = established_get_next(seq, rc);
2033 		--pos;
2034 	}
2035 	return rc;
2036 }
2037 
2038 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2039 {
2040 	void *rc;
2041 	struct tcp_iter_state *st = seq->private;
2042 
2043 	st->state = TCP_SEQ_STATE_LISTENING;
2044 	rc	  = listening_get_idx(seq, &pos);
2045 
2046 	if (!rc) {
2047 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2048 		rc	  = established_get_idx(seq, pos);
2049 	}
2050 
2051 	return rc;
2052 }
2053 
2054 static void *tcp_seek_last_pos(struct seq_file *seq)
2055 {
2056 	struct tcp_iter_state *st = seq->private;
2057 	int offset = st->offset;
2058 	int orig_num = st->num;
2059 	void *rc = NULL;
2060 
2061 	switch (st->state) {
2062 	case TCP_SEQ_STATE_LISTENING:
2063 		if (st->bucket >= INET_LHTABLE_SIZE)
2064 			break;
2065 		st->state = TCP_SEQ_STATE_LISTENING;
2066 		rc = listening_get_next(seq, NULL);
2067 		while (offset-- && rc)
2068 			rc = listening_get_next(seq, rc);
2069 		if (rc)
2070 			break;
2071 		st->bucket = 0;
2072 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2073 		/* Fallthrough */
2074 	case TCP_SEQ_STATE_ESTABLISHED:
2075 		if (st->bucket > tcp_hashinfo.ehash_mask)
2076 			break;
2077 		rc = established_get_first(seq);
2078 		while (offset-- && rc)
2079 			rc = established_get_next(seq, rc);
2080 	}
2081 
2082 	st->num = orig_num;
2083 
2084 	return rc;
2085 }
2086 
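/*
 * seq_file ->start callback.  If the requested *pos is the position we
 * stopped at last time (st->last_pos), tcp_seek_last_pos() resumes the walk
 * from the saved bucket/offset instead of rescanning the hash tables from
 * the beginning; otherwise we fall back to a full walk via tcp_get_idx().
 * Whatever bucket lock listening_get_next()/established_get_first() leave
 * held is released in tcp_seq_stop() below.
 */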
2087 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2088 {
2089 	struct tcp_iter_state *st = seq->private;
2090 	void *rc;
2091 
2092 	if (*pos && *pos == st->last_pos) {
2093 		rc = tcp_seek_last_pos(seq);
2094 		if (rc)
2095 			goto out;
2096 	}
2097 
2098 	st->state = TCP_SEQ_STATE_LISTENING;
2099 	st->num = 0;
2100 	st->bucket = 0;
2101 	st->offset = 0;
2102 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2103 
2104 out:
2105 	st->last_pos = *pos;
2106 	return rc;
2107 }
2108 
2109 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2110 {
2111 	struct tcp_iter_state *st = seq->private;
2112 	void *rc = NULL;
2113 
2114 	if (v == SEQ_START_TOKEN) {
2115 		rc = tcp_get_idx(seq, 0);
2116 		goto out;
2117 	}
2118 
2119 	switch (st->state) {
2120 	case TCP_SEQ_STATE_LISTENING:
2121 		rc = listening_get_next(seq, v);
2122 		if (!rc) {
2123 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2124 			st->bucket = 0;
2125 			st->offset = 0;
2126 			rc	  = established_get_first(seq);
2127 		}
2128 		break;
2129 	case TCP_SEQ_STATE_ESTABLISHED:
2130 		rc = established_get_next(seq, v);
2131 		break;
2132 	}
2133 out:
2134 	++*pos;
2135 	st->last_pos = *pos;
2136 	return rc;
2137 }
2138 
2139 static void tcp_seq_stop(struct seq_file *seq, void *v)
2140 {
2141 	struct tcp_iter_state *st = seq->private;
2142 
2143 	switch (st->state) {
2144 	case TCP_SEQ_STATE_LISTENING:
2145 		if (v != SEQ_START_TOKEN)
2146 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2147 		break;
2148 	case TCP_SEQ_STATE_ESTABLISHED:
2149 		if (v)
2150 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2151 		break;
2152 	}
2153 }
2154 
2155 int tcp_seq_open(struct inode *inode, struct file *file)
2156 {
2157 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2158 	struct tcp_iter_state *s;
2159 	int err;
2160 
2161 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2162 			  sizeof(struct tcp_iter_state));
2163 	if (err < 0)
2164 		return err;
2165 
2166 	s = ((struct seq_file *)file->private_data)->private;
2167 	s->family		= afinfo->family;
2168 	s->last_pos		= 0;
2169 	return 0;
2170 }
2171 EXPORT_SYMBOL(tcp_seq_open);
2172 
2173 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2174 {
2175 	int rc = 0;
2176 	struct proc_dir_entry *p;
2177 
2178 	afinfo->seq_ops.start		= tcp_seq_start;
2179 	afinfo->seq_ops.next		= tcp_seq_next;
2180 	afinfo->seq_ops.stop		= tcp_seq_stop;
2181 
2182 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2183 			     afinfo->seq_fops, afinfo);
2184 	if (!p)
2185 		rc = -ENOMEM;
2186 	return rc;
2187 }
2188 EXPORT_SYMBOL(tcp_proc_register);
2189 
2190 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2191 {
2192 	remove_proc_entry(afinfo->name, net->proc_net);
2193 }
2194 EXPORT_SYMBOL(tcp_proc_unregister);
2195 
2196 static void get_openreq4(const struct request_sock *req,
2197 			 struct seq_file *f, int i)
2198 {
2199 	const struct inet_request_sock *ireq = inet_rsk(req);
2200 	long delta = req->rsk_timer.expires - jiffies;
2201 
2202 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2203 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2204 		i,
2205 		ireq->ir_loc_addr,
2206 		ireq->ir_num,
2207 		ireq->ir_rmt_addr,
2208 		ntohs(ireq->ir_rmt_port),
2209 		TCP_SYN_RECV,
2210 		0, 0, /* could print option size, but that is af dependent. */
2211 		1,    /* timers active (only the expire timer) */
2212 		jiffies_delta_to_clock_t(delta),
2213 		req->num_timeout,
2214 		from_kuid_munged(seq_user_ns(f),
2215 				 sock_i_uid(req->rsk_listener)),
2216 		0,  /* non-standard timer */
2217 		0, /* open_requests have no inode */
2218 		0,
2219 		req);
2220 }
2221 
2222 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2223 {
2224 	int timer_active;
2225 	unsigned long timer_expires;
2226 	const struct tcp_sock *tp = tcp_sk(sk);
2227 	const struct inet_connection_sock *icsk = inet_csk(sk);
2228 	const struct inet_sock *inet = inet_sk(sk);
2229 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2230 	__be32 dest = inet->inet_daddr;
2231 	__be32 src = inet->inet_rcv_saddr;
2232 	__u16 destp = ntohs(inet->inet_dport);
2233 	__u16 srcp = ntohs(inet->inet_sport);
2234 	int rx_queue;
2235 	int state;
2236 
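	/* Encode which timer is pending using the historical /proc codes:
	 * 1 = retransmit/loss-probe/reo timer, 4 = zero window probe,
	 * 2 = keepalive (sk_timer), 0 = none.  TIME_WAIT sockets report 3
	 * from get_timewait4_sock().
	 */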
2237 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2238 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2239 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2240 		timer_active	= 1;
2241 		timer_expires	= icsk->icsk_timeout;
2242 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2243 		timer_active	= 4;
2244 		timer_expires	= icsk->icsk_timeout;
2245 	} else if (timer_pending(&sk->sk_timer)) {
2246 		timer_active	= 2;
2247 		timer_expires	= sk->sk_timer.expires;
2248 	} else {
2249 		timer_active	= 0;
2250 		timer_expires = jiffies;
2251 	}
2252 
2253 	state = sk_state_load(sk);
2254 	if (state == TCP_LISTEN)
2255 		rx_queue = sk->sk_ack_backlog;
2256 	else
2257 		/* Because we don't lock the socket,
2258 		 * we might find a transient negative value.
2259 		 */
2260 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2261 
2262 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2263 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2264 		i, src, srcp, dest, destp, state,
2265 		tp->write_seq - tp->snd_una,
2266 		rx_queue,
2267 		timer_active,
2268 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2269 		icsk->icsk_retransmits,
2270 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2271 		icsk->icsk_probes_out,
2272 		sock_i_ino(sk),
2273 		atomic_read(&sk->sk_refcnt), sk,
2274 		jiffies_to_clock_t(icsk->icsk_rto),
2275 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2276 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2277 		tp->snd_cwnd,
2278 		state == TCP_LISTEN ?
2279 		    fastopenq->max_qlen :
2280 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2281 }
2282 
2283 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2284 			       struct seq_file *f, int i)
2285 {
2286 	long delta = tw->tw_timer.expires - jiffies;
2287 	__be32 dest, src;
2288 	__u16 destp, srcp;
2289 
2290 	dest  = tw->tw_daddr;
2291 	src   = tw->tw_rcv_saddr;
2292 	destp = ntohs(tw->tw_dport);
2293 	srcp  = ntohs(tw->tw_sport);
2294 
2295 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2296 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2297 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2298 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2299 		atomic_read(&tw->tw_refcnt), tw);
2300 }
2301 
2302 #define TMPSZ 150
2303 
2304 static int tcp4_seq_show(struct seq_file *seq, void *v)
2305 {
2306 	struct tcp_iter_state *st;
2307 	struct sock *sk = v;
2308 
2309 	seq_setwidth(seq, TMPSZ - 1);
2310 	if (v == SEQ_START_TOKEN) {
2311 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2312 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2313 			   "inode");
2314 		goto out;
2315 	}
2316 	st = seq->private;
2317 
2318 	if (sk->sk_state == TCP_TIME_WAIT)
2319 		get_timewait4_sock(v, seq, st->num);
2320 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2321 		get_openreq4(v, seq, st->num);
2322 	else
2323 		get_tcp4_sock(v, seq, st->num);
2324 out:
2325 	seq_pad(seq, '\n');
2326 	return 0;
2327 }
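/*
 * Together with the header above, each socket therefore yields one line in
 * /proc/net/tcp.  A made-up, abridged example for a LISTEN socket bound to
 * 127.0.0.1:22 might look roughly like:
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 ...
 *
 * The address words are the raw __be32 values printed with %08X (so they
 * appear byte-swapped on little-endian hosts), the ports are in host order,
 * and state 0A is TCP_LISTEN.
 */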
2328 
2329 static const struct file_operations tcp_afinfo_seq_fops = {
2330 	.owner   = THIS_MODULE,
2331 	.open    = tcp_seq_open,
2332 	.read    = seq_read,
2333 	.llseek  = seq_lseek,
2334 	.release = seq_release_net
2335 };
2336 
2337 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2338 	.name		= "tcp",
2339 	.family		= AF_INET,
2340 	.seq_fops	= &tcp_afinfo_seq_fops,
2341 	.seq_ops	= {
2342 		.show		= tcp4_seq_show,
2343 	},
2344 };
2345 
2346 static int __net_init tcp4_proc_init_net(struct net *net)
2347 {
2348 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2349 }
2350 
2351 static void __net_exit tcp4_proc_exit_net(struct net *net)
2352 {
2353 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2354 }
2355 
2356 static struct pernet_operations tcp4_net_ops = {
2357 	.init = tcp4_proc_init_net,
2358 	.exit = tcp4_proc_exit_net,
2359 };
2360 
2361 int __init tcp4_proc_init(void)
2362 {
2363 	return register_pernet_subsys(&tcp4_net_ops);
2364 }
2365 
2366 void tcp4_proc_exit(void)
2367 {
2368 	unregister_pernet_subsys(&tcp4_net_ops);
2369 }
2370 #endif /* CONFIG_PROC_FS */
2371 
2372 struct proto tcp_prot = {
2373 	.name			= "TCP",
2374 	.owner			= THIS_MODULE,
2375 	.close			= tcp_close,
2376 	.connect		= tcp_v4_connect,
2377 	.disconnect		= tcp_disconnect,
2378 	.accept			= inet_csk_accept,
2379 	.ioctl			= tcp_ioctl,
2380 	.init			= tcp_v4_init_sock,
2381 	.destroy		= tcp_v4_destroy_sock,
2382 	.shutdown		= tcp_shutdown,
2383 	.setsockopt		= tcp_setsockopt,
2384 	.getsockopt		= tcp_getsockopt,
2385 	.keepalive		= tcp_set_keepalive,
2386 	.recvmsg		= tcp_recvmsg,
2387 	.sendmsg		= tcp_sendmsg,
2388 	.sendpage		= tcp_sendpage,
2389 	.backlog_rcv		= tcp_v4_do_rcv,
2390 	.release_cb		= tcp_release_cb,
2391 	.hash			= inet_hash,
2392 	.unhash			= inet_unhash,
2393 	.get_port		= inet_csk_get_port,
2394 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2395 	.stream_memory_free	= tcp_stream_memory_free,
2396 	.sockets_allocated	= &tcp_sockets_allocated,
2397 	.orphan_count		= &tcp_orphan_count,
2398 	.memory_allocated	= &tcp_memory_allocated,
2399 	.memory_pressure	= &tcp_memory_pressure,
2400 	.sysctl_mem		= sysctl_tcp_mem,
2401 	.sysctl_wmem		= sysctl_tcp_wmem,
2402 	.sysctl_rmem		= sysctl_tcp_rmem,
2403 	.max_header		= MAX_TCP_HEADER,
2404 	.obj_size		= sizeof(struct tcp_sock),
2405 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2406 	.twsk_prot		= &tcp_timewait_sock_ops,
2407 	.rsk_prot		= &tcp_request_sock_ops,
2408 	.h.hashinfo		= &tcp_hashinfo,
2409 	.no_autobind		= true,
2410 #ifdef CONFIG_COMPAT
2411 	.compat_setsockopt	= compat_tcp_setsockopt,
2412 	.compat_getsockopt	= compat_tcp_getsockopt,
2413 #endif
2414 	.diag_destroy		= tcp_abort,
2415 };
2416 EXPORT_SYMBOL(tcp_prot);
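/*
 * tcp_prot is registered early in boot from inet_init() (net/ipv4/af_inet.c)
 * via proto_register() and is attached to SOCK_STREAM/IPPROTO_TCP sockets
 * through the inetsw table, so the callbacks above back every IPv4 TCP
 * socket in the system.
 */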
2417 
2418 static void __net_exit tcp_sk_exit(struct net *net)
2419 {
2420 	int cpu;
2421 
2422 	for_each_possible_cpu(cpu)
2423 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2424 	free_percpu(net->ipv4.tcp_sk);
2425 }
2426 
2427 static int __net_init tcp_sk_init(struct net *net)
2428 {
2429 	int res, cpu, cnt;
2430 
2431 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2432 	if (!net->ipv4.tcp_sk)
2433 		return -ENOMEM;
2434 
2435 	for_each_possible_cpu(cpu) {
2436 		struct sock *sk;
2437 
2438 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2439 					   IPPROTO_TCP, net);
2440 		if (res)
2441 			goto fail;
2442 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2443 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2444 	}
2445 
2446 	net->ipv4.sysctl_tcp_ecn = 2;
2447 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2448 
2449 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2450 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2451 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2452 
2453 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2454 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2455 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2456 
2457 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2458 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2459 	net->ipv4.sysctl_tcp_syncookies = 1;
2460 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2461 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2462 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2463 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2464 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2465 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2466 	net->ipv4.sysctl_tcp_tw_reuse = 0;
2467 
2468 	cnt = tcp_hashinfo.ehash_mask + 1;
2469 	net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
2470 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2471 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2472 
2473 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2474 
2475 	return 0;
2476 fail:
2477 	tcp_sk_exit(net);
2478 
2479 	return res;
2480 }
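/*
 * Most of the per-netns defaults chosen above surface as the familiar
 * sysctl knobs wired up in net/ipv4/sysctl_net_ipv4.c; for example,
 * net->ipv4.sysctl_tcp_syncookies is what "sysctl net.ipv4.tcp_syncookies"
 * reads and writes for this namespace.
 */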
2481 
2482 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2483 {
2484 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2485 }
2486 
2487 static struct pernet_operations __net_initdata tcp_sk_ops = {
2488        .init	   = tcp_sk_init,
2489        .exit	   = tcp_sk_exit,
2490        .exit_batch = tcp_sk_exit_batch,
2491 };
2492 
2493 void __init tcp_v4_init(void)
2494 {
2495 	if (register_pernet_subsys(&tcp_sk_ops))
2496 		panic("Failed to create the TCP control socket.\n");
2497 }
2498